see http://matpalm.com/blog/2011/08/13/wikipedia-philosophy
-----
# get data
cd data
wget http://dumps.wikimedia.org/enwiki/latest/enwiki-latest-pages-articles.xml.bz2 # 9gb
# flatten the dump so each <page> element sits on a single line
bzcat enwiki-latest-pages-articles.xml.bz2 | ~/flatten_to_one_page_per_line.py > enwiki-latest-pages-articles.pageperline.xml # 30gb
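# (aside) flatten_to_one_page_per_line.py isn't reproduced here; roughly it
# buffers everything between <page> and </page> and prints each page as one
# line, so later steps can treat one line == one article. a sketch of the idea,
# illustrative only -- the real script may differ:
# --- sketch: flatten_to_one_page_per_line (illustrative only) ---
import sys

in_page = False
buffered = []
for line in sys.stdin:
    stripped = line.strip()
    if stripped.startswith("<page>"):
        in_page = True
    if in_page:
        buffered.append(stripped)
    if stripped.startswith("</page>"):
        # emit the whole <page> element as a single line
        print(" ".join(buffered))
        buffered = []
        in_page = False
# --- end sketch ---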
# split into redirects and articles
cat enwiki-latest-pages-articles.pageperline.xml | grep '<redirect />' > enwiki-latest-pages-redirects.xml &
cat enwiki-latest-pages-articles.pageperline.xml | grep -v '<redirect />' > enwiki-latest-pages-articles.xml &
wait
# move xml for articles and redirects into hdfs
hadoop fs -mkdir /full/articles.xml
hadoop fs -copyFromLocal enwiki-latest-pages-articles.xml /full/articles.xml
hadoop fs -mkdir /full/redirects.xml
hadoop fs -copyFromLocal enwiki-latest-pages-redirects.xml /full/redirects.xml
# parse redirects
cd
hadoop jar ~/contrib/streaming/hadoop-streaming.jar \
-input /full/redirects.xml -output /full/redirects \
-mapper redirect_parser.py -file redirect_parser.py
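# (aside) redirect_parser.py isn't reproduced here; as a streaming mapper it
# presumably reads one <page> element per line and emits one tab separated
# "redirect_title \t target_title" pair per redirect page, taking the target
# from the "#REDIRECT [[...]]" wikitext. a sketch of that guess -- the real
# parser may differ:
# --- sketch: redirect mapper (illustrative only) ---
import re
import sys

TITLE_RE = re.compile(r"<title>(.*?)</title>")
TARGET_RE = re.compile(r"#REDIRECT\s*\[\[([^\]|#]+)", re.IGNORECASE)

for page in sys.stdin:
    title = TITLE_RE.search(page)
    target = TARGET_RE.search(page)
    if title and target:
        print("%s\t%s" % (title.group(1), target.group(1).strip()))
# --- end sketch ---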
# dereference all redirects to their final targets; the pig script is run
# repeatedly so chained redirects (A -> B -> C) end up pointing at their
# ultimate target
pig -p INPUT=/full/redirects -p OUTPUT=/full/redirects.dereferenced1 -f dereference_redirects.pig
pig -p INPUT=/full/redirects.dereferenced1 -p OUTPUT=/full/redirects.dereferenced2 -f dereference_redirects.pig
pig -p INPUT=/full/redirects.dereferenced2 -p OUTPUT=/full/redirects.dereferenced3 -f dereference_redirects.pig
pig -p INPUT=/full/redirects.dereferenced3 -p OUTPUT=/full/redirects.dereferenced4 -f dereference_redirects.pig
# (hfs here is shorthand for "hadoop fs")
hfs -mv /full/redirects /full/redirects.original
hfs -mv /full/redirects.dereferenced4 /full/redirects
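# (aside) the intent of a single dereference pass, sketched in python for
# intuition; the actual work is done by dereference_redirects.pig on the
# cluster and may differ in detail:
# --- sketch: one dereference pass (illustrative only) ---
def dereference_once(redirects):
    # redirects: {source_title: target_title}
    # follow each target one more hop if the target is itself a redirect,
    # so a chain A -> B -> C becomes A -> C after one pass
    return {src: redirects.get(dst, dst) for src, dst in redirects.items()}

redirects = {"A": "B", "B": "C", "C": "Philosophy"}
print(dereference_once(dereference_once(redirects)))  # all now point at Philosophy
# --- end sketch ---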
# run extraction
hadoop jar ~/contrib/streaming/hadoop-streaming.jar \
-input /full/articles.xml -output /full/edges \
-mapper article_parser.py -file article_parser.py
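# (aside) article_parser.py builds the edge list: per the blog post, each
# article gets a single outgoing edge to the first wiki link in its body text.
# very roughly -- the real parser has much more special-casing (templates,
# parenthesised links, etc), hence manually_derived_edges below:
# --- sketch: article mapper (illustrative only) ---
import re
import sys

TITLE_RE = re.compile(r"<title>(.*?)</title>")
LINK_RE = re.compile(r"\[\[([^\]|#]+)")  # first [[link]] target

for page in sys.stdin:
    title = TITLE_RE.search(page)
    link = LINK_RE.search(page)
    if title and link:
        print("%s\t%s" % (title.group(1), link.group(1).strip()))
# --- end sketch ---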
# apply the dereferenced redirects to the edges, so edge targets point at
# final articles rather than redirect pages
pig -p INPUT=/full/edges -p OUTPUT=/full/edges.dereferenced -f dereference_redirects.pig
# as a sanity check, running the dereference again should produce the same output
# pig -p INPUT=/full/edges.dereferenced -p OUTPUT=/full/edges.dereferenced.sanity -f dereference_redirects.pig
# get to local filesystem
hadoop fs -cat /full/edges.dereferenced/* > edges
# hadoop fs -cat /full/edges.dereferenced.sanity/* > edges.sanity
# add special cases, the parser will, alas, never be perfect...
cat manually_derived_edges >> edges
# calculate distance from Philosophy
java -Xmx8g -cp distanceToPhilosophy/bin/ DistanceToPhilosophy \
Philosophy edges \
>DistanceToPhilosophy.stdout 2>DistanceToPhilosophy.stderr
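# (aside) conceptually this is a breadth first search from Philosophy over the
# reversed edge list; a python sketch of the idea only -- the java code in
# distanceToPhilosophy/ is the real implementation and its output format may
# differ:
# --- sketch: distance calculation (illustrative only) ---
import collections
import sys

def distances_to(root, edges_file):
    # edges_file: tab separated "source \t target" lines
    reverse = collections.defaultdict(list)  # target -> [sources]
    with open(edges_file) as f:
        for line in f:
            src, dst = line.rstrip("\n").split("\t")
            reverse[dst].append(src)
    dist = {root: 0}
    queue = collections.deque([root])
    while queue:
        node = queue.popleft()
        for src in reverse[node]:
            if src not in dist:
                dist[src] = dist[node] + 1
                queue.append(src)
    return dist

if __name__ == "__main__":
    root, edges_file = sys.argv[1], sys.argv[2]
    for article, d in distances_to(root, edges_file).items():
        print("FINAL %s\t%d" % (article, d))
# --- end sketch ---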
# order the articles that reached Philosophy by their number of descendants
grep ^FINAL DistanceToPhilosophy.stdout | sed -es/FINAL\ // > distances
cut -f1 distances > articles
hfs -mkdir /articles_that_led_to_philosophy
hfs -copyFromLocal articles /articles_that_led_to_philosophy
hadoop jar ~/contrib/streaming/hadoop-streaming.jar \
-input /articles_that_led_to_philosophy -output /num_descendants \
-mapper 'count_descendants.py edges' -file count_descendants.py -file edges \
-reducer aggregate
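# (aside) count_descendants.py is shipped with the edge list (-file edges) and,
# i'd guess, walks the first-link chain from each input article, emitting a
# count of 1 for every node the chain passes through; the streaming 'aggregate'
# reducer then sums these, giving each node its number of descendants. a sketch
# of that guess only -- the real mapper may differ:
# --- sketch: descendant-counting mapper (illustrative only) ---
import sys

def load_edges(path):
    # tab separated "source \t target" lines -> {source: target}
    edges = {}
    with open(path) as f:
        for line in f:
            src, dst = line.rstrip("\n").split("\t")
            edges[src] = dst
    return edges

edges = load_edges(sys.argv[1])
for line in sys.stdin:
    node, seen = line.strip(), set()
    # follow the chain of first links, crediting every ancestor on the way
    while node in edges and node not in seen:
        seen.add(node)
        node = edges[node]
        # "LongValueSum:" keys are summed by hadoop streaming's aggregate reducer
        print("LongValueSum:%s\t1" % node)
# --- end sketch ---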
hfs -cat /num_descendants/* | sort -k2 -t" " -nr > descendants.sorted
# draw a graph of the top 200
head -n 200 descendants.sorted > descendants.top200
./filter_nodes.py descendants.top200 < edges > filtered.edges.top200
./to_dot.py filtered.edges.top200 descendants.top200 | dot -Tpng > top200.png
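# (aside) filter_nodes.py and to_dot.py aren't reproduced here; filter_nodes.py
# presumably keeps just the edges whose endpoints are both in the top-200 list
# so the rendered graph stays readable, and to_dot.py turns the result into
# graphviz dot. a sketch of the filtering guess only:
# --- sketch: edge filter (illustrative only) ---
import sys

# first arg, e.g. descendants.top200: "node \t descendant_count" lines
with open(sys.argv[1]) as f:
    keep = set(line.rstrip("\n").split("\t")[0] for line in f)

# edges arrive on stdin as "source \t target" lines
for line in sys.stdin:
    src, dst = line.rstrip("\n").split("\t")
    if src in keep and dst in keep:
        sys.stdout.write(line)
# --- end sketch ---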