forked from bitextor/bitextor
-
Notifications
You must be signed in to change notification settings - Fork 0
/
basic.yaml
40 lines (32 loc) · 863 Bytes
/
basic.yaml
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
# BASIC VARIABLES
# Working directories used by the pipeline.
# tempDir and transientDir are set to the same location here.
dataDir: '~/permanent/data'
permanentDir: '~/permanent'
transientDir: '~/transient'
tempDir: '~/transient'
# DATA SOURCES - CRAWLING
# hostsFile: contains one domain to crawl per line.
hostsFile: '~/hosts.gz'
crawler: 'wget'
# 86400 seconds = 24 hours
crawlTimeLimit: '86400s'
# DATA SOURCES - WARC FILES
# warcsFile: contains one path to a WARC file per line.
warcsFile: '~/warcs.gz'
# PREPROCESSING
preprocessor: 'warc2text'
# 2^8 = 256 shards
shards: 8
# each shard is split into chunks of 1024 MB
batches: 1024
# The language pair being processed.
lang1: 'en'
lang2: 'fr'
# ALIGN
# Document alignment settings.
documentAligner: 'externalMT'
documentAlignerThreshold: 0.1
# Placeholder path: replace with the location of your own translation script.
alignerCmd: 'bash /path/to/your/translate.sh'
translationDirection: 'fr2en'
# Sentence alignment settings.
sentenceAligner: 'bleualign'
sentenceAlignerThreshold: 0.1
# CLEANING
# Boolean switches use the canonical lowercase `true`/`false` spelling so that
# every YAML loader (1.1 and 1.2, including strict JSON-schema loaders) reads
# them as booleans rather than strings.
bifixer: true
bicleaner: true
# Model and threshold read when bicleaner is enabled above
# (presumably a minimum score; confirm against the bicleaner docs).
bicleanerModel: '~/bicleaner-model/en-fr/en-fr.yaml'
bicleanerThreshold: 0.5
biroamer: false
# Switches named after output variants (deduplicated, TMX).
# NOTE(review): exact effect of each flag is not visible in this file; confirm
# against the pipeline documentation.
deduped: true
tmx: true