Dataset:
ccdv/WCEP-10
Summarization dataset copied from PRIMERA
This dataset is compatible with the run_summarization.py script from Transformers if you add this line to the summarization_name_mapping variable:
"ccdv/WCEP-10": ("document", "summary")
4 possible configs:
This dataset has 3 splits: train, validation, and test.
| Dataset Split | Number of Instances |
|---|---|
| Train | 8158 |
| Validation | 1020 |
| Test | 1022 |
@article{DBLP:journals/corr/abs-2005-10070,
  author     = {Demian Gholipour Ghalandari and
                Chris Hokamp and
                Nghia The Pham and
                John Glover and
                Georgiana Ifrim},
  title      = {A Large-Scale Multi-Document Summarization Dataset from the {Wikipedia}
                Current Events Portal},
  journal    = {CoRR},
  volume     = {abs/2005.10070},
  year       = {2020},
  url        = {https://arxiv.org/abs/2005.10070},
  eprinttype = {arXiv},
  eprint     = {2005.10070},
  timestamp  = {Fri, 22 May 2020 16:21:28 +0200},
  biburl     = {https://dblp.org/rec/journals/corr/abs-2005-10070.bib},
  bibsource  = {dblp computer science bibliography, https://dblp.org},
}
@article{DBLP:journals/corr/abs-2110-08499,
  author     = {Wen Xiao and Iz Beltagy and Giuseppe Carenini and Arman Cohan},
  title      = {{PRIMER:} Pyramid-based Masked Sentence Pre-training for Multi-document Summarization},
  journal    = {CoRR},
  volume     = {abs/2110.08499},
  year       = {2021},
  eprinttype = {arXiv},
  eprint     = {2110.08499},
  url        = {https://arxiv.org/abs/2110.08499},
  timestamp  = {Fri, 22 Oct 2021 13:33:09 +0200},
  biburl     = {https://dblp.org/rec/journals/corr/abs-2110-08499.bib},
  bibsource  = {dblp computer science bibliography, https://dblp.org},
}