-
Notifications
You must be signed in to change notification settings - Fork 8
/
Copy path3A_fetch_text.py
79 lines (64 loc) · 2.38 KB
/
3A_fetch_text.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
"""
Fetch texts from Wikipedia by category.
I used the categories "Featured Articles" and/or "Quality Articles" to get only longer,
well-reviewed articles. This does not work for every language.
You can add more categories to the dictionary and more entries to the list of languages
to try it yourself. Depending on the number of articles and your internet connection speed,
this may take up to an hour.
"""
import os
import timeit
import wikipediaapi # https://github.com/martin-majlis/Wikipedia-API/
import wikipedia
from slugify import slugify
from tqdm import tqdm
# Reference point for the elapsed-time report printed when the script finishes.
start = timeit.default_timer()

# Wikipedia language editions to download from.
LANGUAGES = ["sk"]

# Category to read the article list from, keyed by language code.
# Entries for other languages are kept commented out; to use one, uncomment it
# and add its language code to LANGUAGES.
CATEGORY = {
    "sk": "Wikipédia:Najlepšie články",
    # "sk": "Wikipédia:Dobré články",
    # "ro": "Wikipedia articole de calitate",
    # "lv": "Vērtīgi raksti",
    # "lt": "Vertingi straipsniai",
    # "hr": "Izabrani članci",
    # "da": "Ugens artikel",
    # "de": "Wikipedia:Lesenswert",
    # "en": "Featured articles",
    # "fr": "Article de qualité",
    # "es": "Wikipedia:Artículos_destacados",
    # "pt": "!Artigos bons",
    # "it": "Wikipedia:Voci_di_qualità",
    # "nl": "Wikipedia:Uitgelicht",
    # "pl": "Artykuły na medal",
}
def make_directory(language):
    """Create the directory *language*, including missing parents.

    Does nothing when the directory already exists.

    Args:
        language: Path of the directory to create. The script passes
            ``"text/<language code>"``.
    """
    # exist_ok=True avoids the check-then-create race of a separate
    # os.path.exists() test followed by os.makedirs().
    os.makedirs(language, exist_ok=True)
def fetch_categorymembers(categorymembers, level=0, max_level=1):
    """Write the text of every page in *categorymembers* to its own file.

    Each page is saved as ``text/<LANGUAGE>/<slugified title>.txt``, where
    ``LANGUAGE`` is the module-level variable bound by the main loop. The
    target directory must already exist.

    Args:
        categorymembers: Mapping whose values are page objects exposing
            ``title`` and ``text`` attributes.
        level: Currently unused; kept so existing calls stay valid.
        max_level: Currently unused; kept so existing calls stay valid.
    """
    for page in tqdm(categorymembers.values()):
        target_path = os.path.join("text", LANGUAGE, slugify(page.title) + ".txt")
        # Explicit UTF-8 so non-ASCII article text is written the same way on
        # every platform; the with-block closes the handle after each page.
        with open(target_path, "w", encoding="utf-8") as out_file:
            out_file.write(page.text)
        ### TEMPORARY (disabled alternative using the `wikipedia` package)
        # title = page.title.replace('Diskussion:', '')
        # try:
        #     text = wikipedia.WikipediaPage(title).content
        #     file = open('text/' + LANGUAGE + '/' + slugify(title) + '.txt', 'w')
        #     file.write(text)
        # except wikipedia.exceptions.DisambiguationError as e:
        #     print('exception.DisambiguationError')
        ### TEMPORARY
# Download every configured language in turn. The loop variable must stay
# named LANGUAGE: fetch_categorymembers() reads it as a module-level global
# to build its output paths.
for LANGUAGE in tqdm(LANGUAGES):
    # A language without a CATEGORY entry would otherwise fail with an opaque
    # TypeError when None is concatenated to "Category:"; skip it explicitly.
    category_name = CATEGORY.get(LANGUAGE)
    if category_name is None:
        print("No category configured for language:", LANGUAGE)
        continue

    # NOTE(review): some Wikipedia-API releases may also require a user agent
    # argument here - confirm against the installed version.
    wiki_wiki = wikipediaapi.Wikipedia(
        language=LANGUAGE, extract_format=wikipediaapi.ExtractFormat.WIKI
    )
    ### TEMPORARY
    # wikipedia.set_lang(LANGUAGE)
    ### TEMPORARY
    category = wiki_wiki.page("Category:" + category_name)
    # print(category.categorymembers)
    make_directory("text/" + LANGUAGE)
    fetch_categorymembers(category.categorymembers)

# Report the total wall-clock time since `start` was taken.
stop = timeit.default_timer()
print("Execution Time: ", stop - start)