OASST1 dataset preprocessed for reward modeling:
# Build a pairwise (chosen / rejected) preference dataset from OASST1 and
# push it to the Hugging Face Hub.
import numpy as np
import pandas as pd

SOURCE_REPO = "OpenAssistant/oasst1"
TARGET_REPO = "tasksource/oasst1_pairwise_rlhf_reward"
SPLITS = ("train", "validation")
OUTPUT_COLUMNS = ["lang", "parent_id", "prompt", "chosen", "rejected"]
GROUP_KEYS = ["prompt", "parent_id", "lang"]


def unroll_histories(messages: pd.DataFrame) -> dict:
    """Map every message id to its unrolled conversation history.

    Args:
        messages: frame with ``message_id``, ``parent_id``, ``role`` and
            ``text`` columns.

    Returns:
        A dict ``message_id -> history`` where the history is one
        ``"<role>: <text>"`` line per message, oldest ancestor first and the
        message itself last, joined with newlines.
    """
    by_id = messages.set_index("message_id")
    text_of = by_id["text"].to_dict()
    role_of = by_id["role"].to_dict()
    parent_of = by_id["parent_id"].to_dict()

    histories = {}
    for message_id in parent_of:
        # Walk up the parent links. The walk stops at the first id that is not
        # a known message (a null parent, or a parent missing from `messages`),
        # so only ids that can actually be looked up end up in the lineage.
        lineage = []
        current = message_id
        while current in parent_of:
            lineage.append(current)
            current = parent_of[current]
        histories[message_id] = "\n".join(
            f"{role_of[m]}: {text_of[m]}" for m in reversed(lineage)
        )
    return histories


def first_and_last(values) -> list:
    """Return ``[first, last]`` of a non-empty iterable (equal if length 1)."""
    items = list(values)
    return [items[0], items[-1]]


def build_preference_pairs(split_df: pd.DataFrame, histories: dict) -> pd.DataFrame:
    """Turn the ranked replies of one split into chosen/rejected pairs.

    Replies without a rank are discarded. The remaining replies are grouped by
    (prompt, parent_id, lang); within each group only the first and the last
    reply (in row order) are compared, and groups where those two share the
    same rank are dropped. ``chosen`` is the reply with the lower rank value,
    ``rejected`` the one with the higher rank value.

    Args:
        split_df: frame with ``parent_id``, ``rank``, ``lang`` and ``text``
            columns. It is not modified.
        histories: mapping produced by :func:`unroll_histories`; a parent id
            that is absent maps to an empty prompt.

    Returns:
        A frame with the columns ``lang``, ``parent_id``, ``prompt``,
        ``chosen`` and ``rejected`` (possibly empty).
    """
    with_prompt = split_df.assign(
        prompt=split_df["parent_id"].map(lambda pid: histories.get(pid, ""))
    )
    ranked = with_prompt[with_prompt["rank"].notna()]
    if ranked.empty:
        # Nothing to pair; skip the group-by so no aggregation runs on zero groups.
        return pd.DataFrame(columns=OUTPUT_COLUMNS)

    grouped = (
        ranked.groupby(GROUP_KEYS)[["text", "rank"]]
        .agg(first_and_last)
        .reset_index()
    )

    # A pair is only informative when its two ranks differ.
    has_preference = np.array(
        [len(set(ranks)) > 1 for ranks in grouped["rank"]], dtype=bool
    )
    # Copy so the column assignments below do not write into a filtered view.
    pairs = grouped.loc[has_preference].copy()

    pairs["chosen"] = [
        texts[np.argmin(ranks)] for texts, ranks in zip(pairs["text"], pairs["rank"])
    ]
    pairs["rejected"] = [
        texts[np.argmax(ranks)] for texts, ranks in zip(pairs["text"], pairs["rank"])
    ]
    return pairs[OUTPUT_COLUMNS]


def main() -> None:
    """Download OASST1, build the pairwise splits and push them to the Hub."""
    # Imported here so the pandas-only helpers above can be imported without
    # the `datasets` package being installed.
    from datasets import Dataset, DatasetDict, concatenate_datasets, load_dataset

    dataset = load_dataset(SOURCE_REPO)

    # Histories are built from every split at once, so a parent id is resolved
    # regardless of which split holds the parent message.
    all_messages = concatenate_datasets(list(dataset.values())).to_pandas()
    histories = unroll_histories(all_messages)

    pairwise = {}
    for split in SPLITS:
        pairs = build_preference_pairs(dataset[split].to_pandas(), histories)
        pairwise[split] = Dataset.from_pandas(pairs, preserve_index=False)

    DatasetDict(pairwise).push_to_hub(TARGET_REPO)


if __name__ == "__main__":
    main()