数据集:
tasksource/oasst1_dense_flat
许可:
The OASST1 dataset, but with each message's parent text retrieved into a `parent_text` column, and keeping only messages with dense annotations (every label has a count greater than 2).
from collections import Counter

from datasets import Dataset, DatasetDict, load_dataset
SOURCE_REPO = "OpenAssistant/oasst1"
TARGET_REPO = "oasst1_dense_flat"
SPLITS = ("train", "validation")


def flatten_dense_labels(df, min_count=3):
    """Return the densely labelled rows of *df* with one column per label.

    The input frame is not modified. The returned frame has these changes:

    * a ``parent_text`` column holding the ``text`` of the row whose
      ``message_id`` equals this row's ``parent_id`` (``''`` when no such
      row exists in *df*);
    * only rows whose ``labels`` cell is not ``None`` and whose smallest
      ``labels['count']`` entry is at least *min_count*;
    * of those, only rows whose ``labels['name']`` sequence equals the most
      frequent one, so every kept row carries the same labels in the same
      order;
    * one extra column per label name, holding the matching entry of
      ``labels['value']``.

    Args:
        df: pandas DataFrame with ``message_id``, ``parent_id``, ``text``
            and ``labels`` columns. Each non-null ``labels`` cell must map
            ``'name'``, ``'value'`` and ``'count'`` to equally ordered
            sequences; ``'count'`` must provide ``.min()`` (e.g. a NumPy
            array).
        min_count: Smallest per-label count a row needs in order to be
            kept. The default of 3 keeps rows whose minimum count is
            greater than 2.

    Returns:
        A new DataFrame containing the filtered rows and the added columns.

    Raises:
        ValueError: If no row satisfies the density requirement.
    """
    text_by_id = df.set_index("message_id")["text"].to_dict()
    # ``assign`` returns a new frame, so the caller's frame stays untouched.
    df = df.assign(
        parent_text=df["parent_id"].map(lambda pid: text_by_id.get(pid, ""))
    )

    # ``astype(bool)`` keeps the mask boolean even when the frame is empty.
    is_dense = df["labels"].map(
        lambda lab: lab is not None and lab["count"].min() >= min_count
    ).astype(bool)
    df = df[is_dense]
    if df.empty:
        raise ValueError(
            f"no message has every label count >= {min_count}"
        )

    # Tuples are hashable, so they can be counted directly.
    names_per_row = df["labels"].map(lambda lab: tuple(lab["name"]))
    label_names = Counter(names_per_row).most_common(1)[0][0]
    same_names = names_per_row.map(
        lambda names: names == label_names
    ).astype(bool)
    # Explicit copy: the label columns below are written to an independent
    # frame rather than to a slice of the filtered one.
    df = df[same_names].copy()

    for label in label_names:
        # Every remaining row shares ``label_names``, so the position can be
        # resolved once per label instead of once per row.
        position = label_names.index(label)
        df[label] = df["labels"].map(lambda lab, i=position: lab["value"][i])
    return df


def main():
    """Build the flattened splits from OASST1 and push them to the Hub."""
    # Load once; both splits come from the same DatasetDict.
    raw = load_dataset(SOURCE_REPO)
    flattened = {
        split: Dataset.from_pandas(
            flatten_dense_labels(raw[split].to_pandas()),
            preserve_index=False,
        )
        for split in SPLITS
    }
    DatasetDict(flattened).push_to_hub(TARGET_REPO)


if __name__ == "__main__":
    main()
 https://github.com/LAION-AI/Open-Assistant
@article{kopf2023openassistant,
  title={OpenAssistant Conversations--Democratizing Large Language Model Alignment},
  author={K{\"o}pf, Andreas and Kilcher, Yannic and von R{\"u}tte, Dimitri and Anagnostidis, Sotiris and Tam, Zhi-Rui and Stevens, Keith and Barhoum, Abdullah and Duc, Nguyen Minh and Stanley, Oliver and Nagyfi, Rich{\'a}rd and others},
  journal={arXiv preprint arXiv:2304.07327},
  year={2023}
}