@inproceedings{elkishky_ccaligned_2020,
  author    = {El-Kishky, Ahmed and Chaudhary, Vishrav and Guzm{\'a}n, Francisco and Koehn, Philipp},
  title     = {{CCAligned}: A Massive Collection of Cross-lingual Web-Document Pairs},
  booktitle = {Proceedings of the 2020 Conference on Empirical Methods in Natural Language Processing ({EMNLP} 2020)},
  month     = nov,
  year      = {2020},
  address   = {Online},
  publisher = {Association for Computational Linguistics},
  pages     = {5960--5969},
  doi       = {10.18653/v1/2020.emnlp-main.480},
  url       = {https://www.aclweb.org/anthology/2020.emnlp-main.480},
}
Domain \tab Source_URL \tab Source_Content \tab Target_URL \tab Target_Content
To obtain aligned documents between two non-English languages, simply join two English-aligned document-pair sets on the English URL (the Source_URL column).
For example, if you have two URLs indicating a pair from English to Arabic (english.test.com and arabic.test.com) as well as two URLs indicating a pair from English to French (english.test.com and french.test.com), you can join on the English URL to create an Arabic-French pair (arabic.test.com and french.test.com).
af_ZA (193M) ak_GH (390K) am_ET (65M) ar_AR (2.0G) as_IN (3.3M) ay_BO (845K) az_AZ (282M) be_BY (220M) bg_BG (1.5G) bm_ML (141K) bn_IN (499M) br_FR (4.4M) bs_BA (50M) ca_ES (842M) cb_IQ (3.9M) cs_CZ (1.9G) cx_PH (20M) cy_GB (229M) de_DE (18G) |
dv_MV (1.2M) el_GR (2.7G) eo_EO (71M) es_XX (16G) fa_IR (883M) ff_NG (280K) fi_FI (1.7G) fo_FO (2.3M) fr_XX (21G) fy_NL (34M) ga_IE (144M) gl_ES (137M) gn_PY (713K) gu_IN (89M) he_IL (894M) hi_IN (1.5G) hr_HR (1.1G) hu_HU (2.0G) id_ID (1.6G) |
ig_NG (32M) is_IS (168M) it_IT (11G) iu_CA (2.6M) ja_XX (5.1G) ka_GE (276M) kg_AO (99K) kk_KZ (164M) km_KH (93M) kn_IN (98M) ko_KR (2.7G) ku_TR (34M) ky_KG (47M) la_VA (72M) lg_UG (471K) li_NL (1.4M) ln_CD (900K) lo_LA (71M) lt_LT (792M) |
lv_LV (799M) mg_MG (37M) mi_NZ (33M) mk_MK (321M) ml_IN (86M) mn_MN (97M) mr_IN (90M) ms_MY (875M) mt_MT (121M) my_MM (69M) my_MM_zaw (8.8M) ne_NP (71M) nl_XX (6.3G) no_XX (1.4G) ns_ZA (1.1M) ny_MW (28M) om_KE (766K) or_IN (2.6M) pa_IN (77M) |
pl_PL (4.5G) ps_AF (40M) pt_XX (6.1G) qa_MM (33K) qd_MM (87K) rm_CH (2.6M) ro_RO (1.5G) ru_RU (14G) rw_RW (1.8M) sc_IT (891K) sd_PK (37M) se_NO (935K) si_LK (80M) sk_SK (1.2G) sl_SI (709M) sn_ZW (20M) so_SO (47M) sq_AL (295M) sr_RS (329M) |
ss_SZ (495K) st_ZA (490K) su_ID (37M) sv_SE (2.1G) sw_KE (104M) sy_SY (6.3M) sz_PL (9.2K) ta_IN (185M) te_IN (93M) tg_TJ (57M) th_TH (2.0G) ti_ET (2.5M) tl_XX (281M) tn_BW (1001K) tr_TR (3.1G) ts_ZA (854K) tt_RU (9.8M) tz_MA (12K) ug_CN (1.4M) |
uk_UA (1.2G) ur_PK (212M) uz_UZ (134M) ve_ZA (385K) vi_VN (1.5G) wo_SN (856K) wy_PH (753K) xh_ZA (27M) yi_DE (76M) yo_NG (39M) zh_CN (3.9G) zh_TW (1.2G) zu_ZA (43M) zz_TR (123K) |
For more on the sentence-pair mining method, see (pdf):
@inproceedings{chaudhary-EtAl:2019:WMT,
  author    = {Chaudhary, Vishrav and Tang, Yuqing and Guzm{\'a}n, Francisco and Schwenk, Holger and Koehn, Philipp},
  title     = {Low-Resource Corpus Filtering Using Multilingual Sentence Embeddings},
  booktitle = {Proceedings of the Fourth Conference on Machine Translation (Volume 3: Shared Task Papers, Day 2)},
  month     = aug,
  year      = {2019},
  address   = {Florence, Italy},
  publisher = {Association for Computational Linguistics},
  pages     = {263--268},
  url       = {http://www.aclweb.org/anthology/W19-5435},
}
The format of this data is:
Source_Sentence \tab Target_Sentence \tab LASER_similarity
af_ZA (75M) ak_GH (18K) am_ET (18M) ar_AR (1.3G) as_IN (829K) ay_BO (34K) az_AZ (47M) az_IR (15K) be_BY (70M) bg_BG (458M) bm_ML (5.2K) bn_IN (147M) br_FR (2.4M) bs_BA (20M) ca_ES (274M) cb_IQ (1.3M) cs_CZ (504M) |
cx_PH (9.1M) cy_GB (35M) da_DK (418M) de_DE (4.7G) el_GR (340M) es_XX (4.3G) et_EE (180M) fa_IR (238M) ff_NG (1.4M) fi_FI (392M) fr_XX (5.1G) gu_IN (7.6M) ha_NG (12M) he_IL (196M) hi_IN (426M) hr_HR (362M) ht_HT (25M) hu_HU (440M) |
hy_AM (52M) id_ID (759M) ig_NG (7.3M) is_IS (57M) it_IT (2.7G) ja_XX (1.2G) jv_ID (24M) ka_GE (64M) kg_AO (2.2K) kk_KZ (33M) km_KH (21M) kn_IN (7.6M) ko_KR (361M) ku_TR (8.3M) ky_KG (12M) lg_UG (271K) ln_CD (413K) lo_LA (6.9M) lt_LT (221M) |
lv_LV (214M) mg_MG (19M) mi_NZ (6.3M) mk_MK (83M) ml_IN (34M) mn_MN (20M) mr_IN (35M) ms_MY (216M) mt_MT (664) my_MM (15M) ne_NP (24M) nl_XX (1.6G) no_XX (356M) ns_ZA (388K) ny_MW (6.8M) om_KE (394K) or_IN (318K) pa_IN (7.2M) pl_PL (1.1G) |
ps_AF (13M) pt_XX (1.9G) qa_MM (7.1K) qd_MM (8.8K) ro_RO (441M) ru_RU (3.7G) si_LK (31M) sk_SK (288M) sl_SI (187M) sn_ZW (5.3M) so_SO (12M) sq_AL (92M) sr_RS (134M) ss_SZ (527K) st_ZA (63K) su_ID (19M) sv_SE (528M) sw_KE (65M) sz_PL (684) |
ta_IN (46M) te_IN (32M) tg_TJ (12M) th_TH (533M) ti_ET (537K) tl_XX (150M) tn_BW (1.3M) tr_TR (801M) ts_ZA (105K) tz_MA (2.4K) uk_UA (440M) ur_PK (65M) ve_ZA (81K) vi_VN (459M) wo_SN (1.6M) xh_ZA (6.2M) |
yo_NG (9.3M) zh_CN (611M) zh_TW (260M) zu_ZA (5.8M) zz_TR (1.4K) |