see http://matpalm.com/blog/2011/08/13/wikipedia-philosophy
-----
# get data
cd data
wget http://dumps.wikimedia.org/enwiki/latest/enwiki-latest-pages-articles.xml.bz2 # 9gb
# flatten the dump so each <page> element sits on a single line
bzcat enwiki-latest-pages-articles.xml.bz2 | ~/flatten_to_one_page_per_line.py > enwiki-latest-pages-articles.pageperline.xml # 30gb
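# (aside) flatten_to_one_page_per_line.py isn't reproduced here; roughly it
# buffers everything between <page> and </page> and prints each page as one
# line, so later steps can treat one line == one article. a sketch of the idea,
# illustrative only -- the real script may differ:
# --- sketch: flatten_to_one_page_per_line (illustrative only) ---
import sys

in_page = False
buffered = []
for line in sys.stdin:
    stripped = line.strip()
    if stripped.startswith("<page>"):
        in_page = True
    if in_page:
        buffered.append(stripped)
    if stripped.startswith("</page>"):
        # emit the whole <page> element as a single line
        print(" ".join(buffered))
        buffered = []
        in_page = False
# --- end sketch ---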
# split into redirects and articles
cat enwiki-latest-pages-articles.pageperline.xml | grep '<redirect />' > enwiki-latest-pages-redirects.xml &
cat enwiki-latest-pages-articles.pageperline.xml | grep -v '<redirect />' > enwiki-latest-pages-articles.xml &
wait
# move xml for articles and redirects into hdfs
hadoop fs -mkdir /full/articles.xml
hadoop fs -copyFromLocal enwiki-latest-pages-articles.xml /full/articles.xml
hadoop fs -mkdir /full/redirects.xml
hadoop fs -copyFromLocal enwiki-latest-pages-redirects.xml /full/redirects.xml
# parse redirects
cd
hadoop jar ~/contrib/streaming/hadoop-streaming.jar \
-input /full/redirects.xml -output /full/redirects \
-mapper redirect_parser.py -file redirect_parser.py
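# (aside) redirect_parser.py isn't reproduced here; as a streaming mapper it
# presumably reads one <page> element per line and emits one tab separated
# "redirect_title \t target_title" pair per redirect page, taking the target
# from the "#REDIRECT [[...]]" wikitext. a sketch of that guess -- the real
# parser may differ:
# --- sketch: redirect mapper (illustrative only) ---
import re
import sys

TITLE_RE = re.compile(r"<title>(.*?)</title>")
TARGET_RE = re.compile(r"#REDIRECT\s*\[\[([^\]|#]+)", re.IGNORECASE)

for page in sys.stdin:
    title = TITLE_RE.search(page)
    target = TARGET_RE.search(page)
    if title and target:
        print("%s\t%s" % (title.group(1), target.group(1).strip()))
# --- end sketch ---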
# dereference all redirects to their final targets; the pig script is run
# repeatedly so chained redirects (A -> B -> C) end up pointing at their
# ultimate target
pig -p INPUT=/full/redirects -p OUTPUT=/full/redirects.dereferenced1 -f dereference_redirects.pig
pig -p INPUT=/full/redirects.dereferenced1 -p OUTPUT=/full/redirects.dereferenced2 -f dereference_redirects.pig
pig -p INPUT=/full/redirects.dereferenced2 -p OUTPUT=/full/redirects.dereferenced3 -f dereference_redirects.pig
pig -p INPUT=/full/redirects.dereferenced3 -p OUTPUT=/full/redirects.dereferenced4 -f dereference_redirects.pig
# (hfs here is shorthand for "hadoop fs")
hfs -mv /full/redirects /full/redirects.original
hfs -mv /full/redirects.dereferenced4 /full/redirects
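# (aside) the intent of a single dereference pass, sketched in python for
# intuition; the actual work is done by dereference_redirects.pig on the
# cluster and may differ in detail:
# --- sketch: one dereference pass (illustrative only) ---
def dereference_once(redirects):
    # redirects: {source_title: target_title}
    # follow each target one more hop if the target is itself a redirect,
    # so a chain A -> B -> C becomes A -> C after one pass
    return {src: redirects.get(dst, dst) for src, dst in redirects.items()}

redirects = {"A": "B", "B": "C", "C": "Philosophy"}
print(dereference_once(dereference_once(redirects)))  # all now point at Philosophy
# --- end sketch ---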
# run extraction
hadoop jar ~/contrib/streaming/hadoop-streaming.jar \
-input /full/articles.xml -output /full/edges \
-mapper article_parser.py -file article_parser.py
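# (aside) article_parser.py builds the edge list: per the blog post, each
# article gets a single outgoing edge to the first wiki link in its body text.
# very roughly -- the real parser has much more special-casing (templates,
# parenthesised links, etc), hence manually_derived_edges below:
# --- sketch: article mapper (illustrative only) ---
import re
import sys

TITLE_RE = re.compile(r"<title>(.*?)</title>")
LINK_RE = re.compile(r"\[\[([^\]|#]+)")  # first [[link]] target

for page in sys.stdin:
    title = TITLE_RE.search(page)
    link = LINK_RE.search(page)
    if title and link:
        print("%s\t%s" % (title.group(1), link.group(1).strip()))
# --- end sketch ---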
# apply the dereferenced redirects to the edges, so edge targets point at
# final articles rather than redirect pages
pig -p INPUT=/full/edges -p OUTPUT=/full/edges.dereferenced -f dereference_redirects.pig
# as a sanity check, running the dereference again should produce the same output
# pig -p INPUT=/full/edges.dereferenced -p OUTPUT=/full/edges.dereferenced.sanity -f dereference_redirects.pig
# get to local filesystem
hadoop fs -cat /full/edges.dereferenced/* > edges
# hadoop fs -cat /full/edges.dereferenced.sanity/* > edges.sanity
# add special cases, the parser will, alas, never be perfect...
cat manually_derived_edges >> edges
# calculate distance from Philosophy
java -Xmx8g -cp distanceToPhilosophy/bin/ DistanceToPhilosophy \
Philosophy edges \
>DistanceToPhilosophy.stdout 2>DistanceToPhilosophy.stderr
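# (aside) conceptually this is a breadth first search from Philosophy over the
# reversed edge list; a python sketch of the idea only -- the java code in
# distanceToPhilosophy/ is the real implementation and its output format may
# differ:
# --- sketch: distance calculation (illustrative only) ---
import collections
import sys

def distances_to(root, edges_file):
    # edges_file: tab separated "source \t target" lines
    reverse = collections.defaultdict(list)  # target -> [sources]
    with open(edges_file) as f:
        for line in f:
            src, dst = line.rstrip("\n").split("\t")
            reverse[dst].append(src)
    dist = {root: 0}
    queue = collections.deque([root])
    while queue:
        node = queue.popleft()
        for src in reverse[node]:
            if src not in dist:
                dist[src] = dist[node] + 1
                queue.append(src)
    return dist

if __name__ == "__main__":
    root, edges_file = sys.argv[1], sys.argv[2]
    for article, d in distances_to(root, edges_file).items():
        print("FINAL %s\t%d" % (article, d))
# --- end sketch ---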
# order the articles that reached Philosophy by their number of descendants
grep ^FINAL DistanceToPhilosophy.stdout | sed -es/FINAL\ // > distances
cut -f1 distances > articles
hfs -mkdir /articles_that_led_to_philosophy
hfs -copyFromLocal articles /articles_that_led_to_philosophy
hadoop jar ~/contrib/streaming/hadoop-streaming.jar \
-input /articles_that_led_to_philosophy -output /num_descendants \
-mapper 'count_descendants.py edges' -file count_descendants.py -file edges \
-reducer aggregate
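# (aside) count_descendants.py is shipped with the edge list (-file edges) and,
# i'd guess, walks the first-link chain from each input article, emitting a
# count of 1 for every node the chain passes through; the streaming 'aggregate'
# reducer then sums these, giving each node its number of descendants. a sketch
# of that guess only -- the real mapper may differ:
# --- sketch: descendant-counting mapper (illustrative only) ---
import sys

def load_edges(path):
    # tab separated "source \t target" lines -> {source: target}
    edges = {}
    with open(path) as f:
        for line in f:
            src, dst = line.rstrip("\n").split("\t")
            edges[src] = dst
    return edges

edges = load_edges(sys.argv[1])
for line in sys.stdin:
    node, seen = line.strip(), set()
    # follow the chain of first links, crediting every ancestor on the way
    while node in edges and node not in seen:
        seen.add(node)
        node = edges[node]
        # "LongValueSum:" keys are summed by hadoop streaming's aggregate reducer
        print("LongValueSum:%s\t1" % node)
# --- end sketch ---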
hfs -cat /num_descendants/* | sort -k2 -t" " -nr > descendants.sorted
# draw a graph of the top 200
head -n 200 descendants.sorted > descendants.top200
./filter_nodes.py descendants.top200 < edges > filtered.edges.top200
./to_dot.py filtered.edges.top200 descendants.top200 | dot -Tpng > top200.png
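# (aside) filter_nodes.py and to_dot.py aren't reproduced here; filter_nodes.py
# presumably keeps just the edges whose endpoints are both in the top-200 list
# so the rendered graph stays readable, and to_dot.py turns the result into
# graphviz dot. a sketch of the filtering guess only:
# --- sketch: edge filter (illustrative only) ---
import sys

# first arg, e.g. descendants.top200: "node \t descendant_count" lines
with open(sys.argv[1]) as f:
    keep = set(line.rstrip("\n").split("\t")[0] for line in f)

# edges arrive on stdin as "source \t target" lines
for line in sys.stdin:
    src, dst = line.rstrip("\n").split("\t")
    if src in keep and dst in keep:
        sys.stdout.write(line)
# --- end sketch ---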