数据集:
tasksource/oasst1_dense_flat
许可:
OASST1 dataset, but with the parent_text retrieved for each message, and keeping only messages with dense annotations (all labels have 2 annotators).
# Build script for `oasst1_dense_flat`: flattens the OASST1 label annotations
# into one column per label and attaches each message's parent text.
from datasets import Dataset, DatasetDict, load_dataset

# Load once up front instead of re-loading the whole dataset for every split.
oasst1 = load_dataset("OpenAssistant/oasst1")

d = {}
for split in ['train', 'validation']:
    df = oasst1[split].to_pandas()

    # Resolve each message's parent text; messages whose parent_id is not a
    # known message_id get an empty string.
    m2t = df.set_index("message_id")['text'].to_dict()
    df['parent_text'] = df.parent_id.map(lambda x: m2t.get(x, ''))

    # Keep only annotated messages whose least-annotated label has a count
    # strictly greater than 2.
    # NOTE(review): the card text says "all labels have 2 annotators" while
    # this threshold is `> 2` -- confirm which one is intended.
    df = df[df.labels.map(lambda x: x is not None)]
    df = df[df.labels.map(lambda x: x['count'].min() > 2)]

    # Pick the most frequent ordered set of label names and keep only the rows
    # that use exactly that layout. Tuples are used because they are hashable,
    # so `value_counts` does not rely on pandas tolerating list values.
    labels = df.labels.map(lambda x: tuple(x['name'])).value_counts().index[0]
    # `.copy()` so the column assignments below write to an independent frame
    # rather than to a filtered view of the previous one.
    df = df[df.labels.map(lambda x: tuple(x['name']) == labels)].copy()

    # Every remaining row shares the same name order, so a label's position
    # can be computed once instead of once per row.
    for label in labels:
        position = labels.index(label)
        df[label] = df.labels.map(lambda x, position=position: x['value'][position])

    d[split] = Dataset.from_pandas(df, preserve_index=False)

DatasetDict(d).push_to_hub('oasst1_dense_flat')
https://github.com/LAION-AI/Open-Assistant
@article{kopf2023openassistant,
title={OpenAssistant Conversations--Democratizing Large Language Model Alignment},
author={K{\"o}pf, Andreas and Kilcher, Yannic and von R{\"u}tte, Dimitri and Anagnostidis, Sotiris and Tam, Zhi-Rui and Stevens, Keith and Barhoum, Abdullah and Duc, Nguyen Minh and Stanley, Oliver and Nagyfi, Rich{\'a}rd and others},
journal={arXiv preprint arXiv:2304.07327},
year={2023}
}