-
Notifications
You must be signed in to change notification settings - Fork 8
/
Copy path3E_concatenate_text.py
56 lines (50 loc) · 1.07 KB
/
3E_concatenate_text.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
"""
Concatenate smaller text files to work around GitHub's limit of at most 1000 files per directory.
"""
import os
import glob
from tqdm import tqdm
from shutil import move
# Sub-directory names under text/ (one per language) whose *.txt files are
# concatenated by the loop further down. Order is the processing order.
LANGUAGES = [
    "cs", "cz", "de", "en", "es",
    "et", "fi", "fr", "hu", "it",
    "nl", "no", "pl", "pt", "se",
    "sv", "da", "hr", "sl", "lt",
    "tr", "lv", "ro", "sk", "sq",
]
# Inputs are flushed into a new grouped file once their combined on-disk size
# (in bytes, as reported by fstat) exceeds this value.
GROUP_SIZE_THRESHOLD = 200000
# File-name prefix of the concatenated output files ("grouped-<N>.txt").
GROUP_PREFIX = "grouped-"


def _highest_group_index(directory):
    """Return the largest N for which grouped-N.txt exists in *directory*, or 0."""
    highest = 0
    for existing in glob.glob(os.path.join(directory, GROUP_PREFIX + "*.txt")):
        stem = os.path.basename(existing)[len(GROUP_PREFIX):-len(".txt")]
        if stem.isdecimal():
            highest = max(highest, int(stem))
    return highest


def _write_group(directory, index, parts, sources):
    """Write the joined *parts* to grouped-<index>.txt, then delete *sources*."""
    target = os.path.join(directory, GROUP_PREFIX + str(index) + ".txt")
    with open(target, "w", encoding="utf-8") as outputFile:
        outputFile.write("".join(parts))  # write file content
    # Remove the inputs only after the output file has been written and closed,
    # so an interrupted run never deletes text that was not saved yet.
    for source in sources:
        os.remove(source)


# For every language directory: read the small *.txt files, concatenate them
# (each file's content preceded by a newline) into grouped-<N>.txt files of
# roughly GROUP_SIZE_THRESHOLD bytes, and delete the inputs that were merged.
for LANGUAGE in LANGUAGES:
    print(LANGUAGE)
    directory = os.path.join("text", LANGUAGE)
    # Continue numbering after any grouped files left by an earlier run so that
    # they are not overwritten.
    outputCount = _highest_group_index(directory)
    fileSize = 0
    parts = []    # text pieces collected for the group currently being built
    pending = []  # input paths whose content is in `parts` but not yet on disk
    # Skip earlier outputs: the "*.txt" pattern would otherwise match them and
    # feed grouped files back in as inputs. Sorting makes the grouping
    # reproducible (glob returns paths in arbitrary order).
    inputPaths = sorted(
        path
        for path in glob.glob(os.path.join(directory, "*.txt"))
        if not os.path.basename(path).startswith(GROUP_PREFIX)
    )
    for path in tqdm(inputPaths):
        # Explicit encoding so the result does not depend on the platform's
        # locale default. NOTE(review): assumes the corpus is UTF-8 - confirm.
        with open(path, "r", encoding="utf-8") as inputFile:
            fileSize += os.fstat(inputFile.fileno()).st_size
            parts.append('\n' + inputFile.read())
        pending.append(path)
        if fileSize > GROUP_SIZE_THRESHOLD:
            outputCount += 1
            _write_group(directory, outputCount, parts, pending)
            fileSize = 0
            parts = []
            pending = []
    # Flush the remainder that never crossed the threshold; without this the
    # last files of every language would be deleted without being written.
    if pending:
        outputCount += 1
        _write_group(directory, outputCount, parts, pending)