Skip to content

Commit

Permalink
New languages
Browse files Browse the repository at this point in the history
  • Loading branch information
tkrajina committed Jan 27, 2018
1 parent d768521 commit dd448d3
Show file tree
Hide file tree
Showing 4 changed files with 72 additions and 60 deletions.
29 changes: 16 additions & 13 deletions download_files.sh
Original file line number Diff line number Diff line change
Expand Up @@ -2,30 +2,33 @@ mkdir -p bucket_files
mkdir -p raw_files
cd raw_files

wget http://opus.nlpl.eu/download.php?f=OpenSubtitles2018%2Fen-sl.tmx.gz -O en-sl.tmx.gz
wget -c http://opus.nlpl.eu/download.php?f=OpenSubtitles2018%2Fen-sl.tmx.gz -O en-sl.tmx.gz
gzip -d en-sl.tmx.gz

wget http://www.statmt.org/europarl/v7/et-en.tgz
gzip -d et-en.tgz
tar -xvf et-en.tar
wget -c http://opus.nlpl.eu/download.php?f=OpenSubtitles2018%2Fen-et.tmx.gz -O en-et.tmx.gz
gzip -d en-et.tmx.gz

wget http://www.statmt.org/europarl/v7/lv-en.tgz
gzip -d lv-en.tgz
tar -xvf lv-en.tar
wget -c http://opus.nlpl.eu/download.php?f=OpenSubtitles2018%2Fen-lv.tmx.gz -O en-lv.tmx.gz
gzip -d en-lv.tmx.gz

wget http://www.statmt.org/europarl/v7/sk-en.tgz
gzip -d sk-en.tgz
tar -xvf sk-en.tar
wget -c http://opus.nlpl.eu/download.php?f=OpenSubtitles2018%2Fen-sk.tmx.gz -O en-sk.tmx.gz
gzip -d en-sk.tmx.gz

wget http://downloads.tatoeba.org/exports/links.tar.bz2
wget -c http://opus.nlpl.eu/download.php?f=OpenSubtitles2018%2Fen-hr.tmx.gz -O en-hr.tmx.gz
gzip -d en-hr.tmx.gz

wget -c http://opus.nlpl.eu/download.php?f=OpenSubtitles2018%2Fen-sq.tmx.gz -O en-sq.tmx.gz
gzip -d en-sq.tmx.gz

wget -c http://downloads.tatoeba.org/exports/links.tar.bz2
bzip2 -d links.tar.bz2
tar -xvf links.tar

wget http://downloads.tatoeba.org/exports/sentences.tar.bz2
wget -c http://downloads.tatoeba.org/exports/sentences.tar.bz2
bzip2 -d sentences.tar.bz2
tar -xvf sentences.tar

wget http://downloads.tatoeba.org/exports/sentences_detailed.tar.bz2
wget -c http://downloads.tatoeba.org/exports/sentences_detailed.tar.bz2
bzip2 -d sentences_detailed.tar.bz2
tar -xvf sentences_detailed.tar

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -26,60 +26,65 @@ public static void main(String[] args) throws Exception {
List<Importer> importers = new ArrayList<>();

String[][] tatoebaLanguagePairs = new String[][]{
//new String[] {"pes", "eng"},
// new String[] {"nob", "eng"},
// new String[] {"ces", "eng"},
// new String[] {"mkd", "eng"},
// new String[] {"ces", "eng"},
// new String[] {"bul", "eng"},
// new String[] {"srp", "eng"},
// new String[] {"dan", "eng"},
// new String[] {"swe", "eng"},
// new String[] {"ukr", "eng"},
// new String[] {"nld", "eng"},
// new String[] {"fin", "eng"},
// new String[] {"mkd", "eng"},
// new String[] {"hun", "eng"},
// new String[] {"pol", "eng"},
// new String[] {"ita", "eng"},
// new String[] {"epo", "eng"},
// new String[] {"lat", "eng"},
// new String[] {"tur", "eng"},
// new String[] {"ell", "eng"},
// new String[] {"ron", "eng"},
// new String[] {"ara", "eng"},
// new String[] {"heb", "eng"},
// new String[] {"deu", "eng"},
// new String[] {"fra", "eng"},
// new String[] {"rus", "eng"},
// new String[] {"por", "eng"},
// new String[] {"spa", "eng"},
// new String[] {"lit", "eng"},
//
// // Nonenglish collections:
// new String[] {"spa", "fra"},
// new String[] {"deu", "ita"},
new String[]{"pes", "eng"},
new String[]{"nob", "eng"},
new String[]{"ces", "eng"},
new String[]{"mkd", "eng"},
new String[]{"ces", "eng"},
new String[]{"bul", "eng"},
new String[]{"srp", "eng"},
new String[]{"dan", "eng"},
new String[]{"swe", "eng"},
new String[]{"ukr", "eng"},
new String[]{"nld", "eng"},
new String[]{"fin", "eng"},
new String[]{"mkd", "eng"},
new String[]{"hun", "eng"},
new String[]{"pol", "eng"},
new String[]{"ita", "eng"},
new String[]{"epo", "eng"},
new String[]{"lat", "eng"},
new String[]{"tur", "eng"},
new String[]{"ell", "eng"},
new String[]{"ron", "eng"},
new String[]{"ara", "eng"},
new String[]{"heb", "eng"},
new String[]{"deu", "eng"},
new String[]{"fra", "eng"},
new String[]{"rus", "eng"},
new String[]{"por", "eng"},
new String[]{"spa", "eng"},
new String[]{"lit", "eng"},

// Non-English collections:
new String[]{"spa", "fra"},
new String[]{"deu", "ita"},
};
for (String[] tatoebaLanguagePair : tatoebaLanguagePairs) {
importers.add(new TatoebaImporter(tatoebaLanguagePair[0], tatoebaLanguagePair[1], tatoebaLanguagePairs));
importers.add(new TatoebaImporter(tatoebaLanguagePair[1], tatoebaLanguagePair[0], tatoebaLanguagePairs));
}

String[][] euImporters = new String[][]{
/* {"est", "eng", "europarl-v7.et-en"},
{"lav", "eng", "europarl-v7.lv-en"},
{"slk", "eng", "europarl-v7.sk-en"},*/
String[][] tmxImporters = new String[][]{
{"slv", "eng", "en-sl.tmx"},
{"est", "eng", "en-et.tmx"},
{"lav", "eng", "en-lv.tmx"},
{"slk", "eng", "en-sk.tmx"},
{"hrv", "eng", "en-hr.tmx"},
{"sqi", "eng", "en-sq.tmx"},
};
for (String[] eu : euImporters) {
for (String[] eu : tmxImporters) {
String lang1 = eu[0];
String lang2 = eu[1];
String baseFilename = eu[2];
importers.add(new EuImporter(lang1, lang2, baseFilename));
importers.add(new EuImporter(lang2, lang1, baseFilename));
importers.add(new TMXImporter(lang1, lang2, baseFilename));
importers.add(new TMXImporter(lang2, lang1, baseFilename));
}

importers.add(new TMXImporter("slv", "eng", "en-sl.tmx"));
importers.add(new TMXImporter("eng", "slv", "en-sl.tmx"));
// Slovene:

// Estonian:


InfoVO info = new InfoVO()
.setLanguages(Languages.getLanguages());
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -55,6 +55,7 @@ public void importCollection(SentenceWriter writer) throws Exception {
final WordCounter counter = new WordCounter();
final List<SentenceVO> sentences = new ArrayList<>();
final Set<Integer> knownSenteceHashes = new HashSet<>();
final Set<Integer> targetSenteceHashes = new HashSet<>();


try {
Expand Down Expand Up @@ -114,18 +115,20 @@ public void endElement(String name) throws SAXException {
// System.out.println(sentenceTranslations);
// System.out.println(knownLang);
// System.out.println(targetLang);
if (sentences.size() < 40_000) {
if (sentences.size() < 60_000) {
String knownLine = sentenceTranslations.get(knownLang.getAbbrev());
String targetLine = sentenceTranslations.get(targetLang.getAbbrev());
if (StringUtils.isNotEmpty(targetLine) && StringUtils.isNotEmpty(knownLine)) {
if (Character.isUpperCase(targetLine.charAt(0)) && Character.isUpperCase(knownLine.charAt(0))) {
String id = String.format("%s-%s-%d", knownLang.getAbbrev(), targetLang.getAbbrev(), targetLine.hashCode());
SentenceVO s = new SentenceVO().setSentenceId(String.valueOf(id)).setKnownSentence(knownLine).setTargetSentence(targetLine);
Integer h = s.getKnownSentence().hashCode();
if (!knownSenteceHashes.contains(h) && sentenceOK(s)) {
Integer knownHash = s.getKnownSentence().hashCode();
Integer targetHash = s.getTargetSentence().hashCode();
if (!targetSenteceHashes.contains(targetHash) && !knownSenteceHashes.contains(knownHash) && sentenceOK(s)) {
sentences.add(s);
counter.countWordsInSentence(s, knownLang, targetLang);
knownSenteceHashes.add(h);
knownSenteceHashes.add(knownHash);
targetSenteceHashes.add(targetHash);
if (sentences.size() % 1000 == 0) {
System.out.println(String.format("%d sentences", sentences.size()));
}
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -130,7 +130,6 @@ public void importCollection(SentenceWriter writer) throws Exception {

System.out.println(String.format("Found %d known language sentences", knownLanguageSentences.size()));
System.out.println(String.format("Found %d target language sentences", targetLanguageSentences.size()));
System.out.println(String.format("%d distinct words, %d words", wordCounter.size(), wordCounter.getCount().intValue()));

List<SentenceVO> sentences = new ArrayList<>();

Expand Down Expand Up @@ -168,6 +167,8 @@ public void importCollection(SentenceWriter writer) throws Exception {
}
}

System.out.println(String.format("%d distinct words, %d words", wordCounter.size(), wordCounter.getCount().intValue()));

// Order by id, so that older ids are deployed in the database (they are more likely to be
// without errors):
Collections.sort(sentences, new Comparator<SentenceVO>() {
Expand Down

0 comments on commit dd448d3

Please sign in to comment.