Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

v0.4.2 (wmt24) #161

Merged
merged 1 commit into from
May 25, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion mtdata/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,7 @@
# Created: 4/4/20


__version__ = '0.4.1'
__version__ = '0.4.2'
__description__ = 'mtdata is a tool to download datasets for machine translation'
__author__ = 'Thamme Gowda'

Expand Down
22 changes: 14 additions & 8 deletions mtdata/index/paracrawl.py
Original file line number Diff line number Diff line change
Expand Up @@ -82,16 +82,22 @@ def load_all(index: Index):
url=f'{URL_PREFIX}/bonus/en-uk-v1.txt.gz',
cite=cite, ext='tsv.gz'))

# Japanese-English paracrawl (5.1) used by WMT20 and WMT21
# Japanese-English paracrawl (5.1) used by WMT20 ...
for version, cols in [('2', (2, 3)), ('3', (3, 4))]:
ent = Entry(did=DatasetId(group='KECL', name=f'paracrawl', version=version, langs=('eng', 'jpn')),
in_paths=['en-ja/en-ja.bicleaner05.txt'], in_ext='tsv', cols=cols, cite='',
url=f'http://www.kecl.ntt.co.jp/icl/lirg/jparacrawl/release/{version}.0/bitext/en-ja.tar.gz')
index.add_entry(ent)

# JParaCrawl Chinese-Japanese, only version 2 is available
if version == '2':
ent = Entry(did=DatasetId(group='KECL', name=f'paracrawl', version=version, langs=('zho', 'jpn')),
in_paths=['zh-ja/zh-ja.bicleaner05.txt'], in_ext='tsv', cols=cols, cite='',
url=f'http://www.kecl.ntt.co.jp/icl/lirg/jparacrawl/release/{version}.0/bitext/zh-ja.tar.gz')
index.add_entry(ent)

# JParaCrawl Chinese-Japanese, v2: cols=2,3
ent = Entry(did=DatasetId(group='KECL', name=f'paracrawl', version='2', langs=('zho', 'jpn')),
in_paths=['zh-ja/zh-ja.bicleaner05.txt'], filename='jparacrawl-2.0-zh-ja.tar.gz',
in_ext='tsv', cols=(2, 3), cite=('morishita-etal-2022-jparacrawl',),
url=f'http://www.kecl.ntt.co.jp/icl/lirg/jparacrawl/release/zh/2.0/bitext/zh-ja.tar.gz')
index.add_entry(ent)
# v2wmt24: columns=(3, 4)
ent = Entry(did=DatasetId(group='KECL', name=f'paracrawl', version=f'2wmt24', langs=('zho', 'jpn')),
in_paths=['zh-ja/zh-ja.crowdsourcing_b05l07.txt'], filename='jparacrawl-2.0-zh-ja.tar.gz',
in_ext='tsv', cols=(3, 4), cite=('nagata2024japanesechinese',),
url=f'http://www.kecl.ntt.co.jp/icl/lirg/jparacrawl/release/zh/2.0/bitext/zh-ja.tar.gz')
index.add_entry(ent)
5 changes: 4 additions & 1 deletion mtdata/main.py
Original file line number Diff line number Diff line change
Expand Up @@ -71,7 +71,8 @@ def echo_data(did:DatasetId, delim='\t'):
path = cache.get_entry(entry)
parser = Parser(path, ext=entry.in_ext or None, ent=entry)
count = 0
for rec in parser.read_segs():
all_segs = parser.read_segs()
for rec in all_segs:
if isinstance(rec, (list, tuple)):
rec = (col.replace(delim, ' ').replace('\n', ' ') for col in rec)
rec = delim.join(rec)
Expand Down Expand Up @@ -323,6 +324,8 @@ def main():
elif args.task == 'get':
get_data(**vars(args))
elif args.task == 'echo':
# disable the progress bar for echo; it sometimes inserts newlines into the output
pbar_man.enabled = False
echo_data(did=args.dataset_id)
elif args.task == 'list-recipe':
list_recipes(id_only=args.id, format=args.format)
Expand Down
28 changes: 27 additions & 1 deletion mtdata/resource/refs.bib
Original file line number Diff line number Diff line change
Expand Up @@ -693,4 +693,30 @@ @inproceedings{goldhahn-etal-2012-building
publisher = "European Language Resources Association (ELRA)",
url = "http://www.lrec-conf.org/proceedings/lrec2012/pdf/327_Paper.pdf",
pages = "759--765",
}
}

%%% Japanese paracrawl
@inproceedings{morishita-etal-2022-jparacrawl,
title = "{JP}ara{C}rawl v3.0: A Large-scale {E}nglish-{J}apanese Parallel Corpus",
author = "Morishita, Makoto and
Chousa, Katsuki and
Suzuki, Jun and
Nagata, Masaaki",
booktitle = "Proceedings of the Thirteenth Language Resources and Evaluation Conference",
month = jun,
year = "2022",
address = "Marseille, France",
publisher = "European Language Resources Association",
url = "https://aclanthology.org/2022.lrec-1.721",
pages = "6704--6710",
}


@misc{nagata2024japanesechinese,
title={A Japanese-Chinese Parallel Corpus Using Crowdsourcing for Web Mining},
author={Masaaki Nagata and Makoto Morishita and Katsuki Chousa and Norihito Yasuda},
year={2024},
eprint={2405.09017},
archivePrefix={arXiv},
primaryClass={cs.CL},
}
Loading