This repository has been archived by the owner on Oct 8, 2019. It is now read-only.
-
Notifications
You must be signed in to change notification settings - Fork 153
webspam dataset
Makoto YUI edited this page Aug 9, 2014
·
8 revisions
Get the dataset (the commands below use webspam_wc_normalized_trigram.svm) from http://www.csie.ntu.edu.tw/~cjlin/libsvmtools/datasets/binary.html#webspam
# Create the HDFS directory that will hold the raw data (-p: no error if it already exists).
hadoop fs -mkdir -p /dataset/webspam/raw
# Run the dataset through conv.awk and stream the result straight into HDFS
# ("-put -" reads from stdin). conv.awk is not shown here; presumably it emits
# tab-separated rowid / label / comma-separated features lines -- confirm.
awk -f conv.awk webspam_wc_normalized_trigram.svm |
  hadoop fs -put - /dataset/webspam/raw/
-- Create (if needed) and switch to the database used throughout this tutorial.
-- IF NOT EXISTS keeps the script re-runnable: a plain CREATE DATABASE fails
-- when the database is already present.
CREATE DATABASE IF NOT EXISTS webspam;
USE webspam;

-- Reload the Hivemall jar: drop any previously registered copy first so the
-- session picks up the current file.
DELETE JAR ./tmp/hivemall.jar;
ADD JAR ./tmp/hivemall.jar;

-- Run the Hivemall definition script; presumably this registers the functions
-- used later on this page (addBias, rand_amplify) -- confirm against the script.
SOURCE ./tmp/define-all.hive;
-- External table over the text files uploaded to /dataset/webspam/raw.
-- Each line holds three tab-separated fields: rowid, label, and a
-- comma-separated list of feature strings.
CREATE EXTERNAL TABLE webspam_raw (
  rowid    INT,
  label    INT,
  features ARRAY<STRING>
)
ROW FORMAT DELIMITED
  FIELDS TERMINATED BY '\t'
  COLLECTION ITEMS TERMINATED BY ","
STORED AS TEXTFILE
LOCATION '/dataset/webspam/raw';
-- Fix the sampling seed so the test split is reproducible across runs.
SET hive.sample.seednumber=43;

-- Hold-out test set: sample rows from the raw table, shuffle them with a
-- seeded random key, and keep at most 70000 rows.
-- NOTE(review): per the Hive sampling docs, TABLESAMPLE(n ROWS) is applied to
-- each input split, so the sampled row count depends on the number of splits
-- -- confirm it yields the intended test-set size.
CREATE TABLE webspam_test AS
SELECT
  s.rowid,
  s.label,
  s.features
FROM webspam_raw TABLESAMPLE(1000 ROWS) s
CLUSTER BY rand(43)
LIMIT 70000;
-- Training table, stored as Snappy-compressed ORC. Same three columns as
-- webspam_raw.
CREATE TABLE webspam_train_orcfile (
  rowid    INT,
  label    INT,
  features ARRAY<STRING>
)
STORED AS ORC
TBLPROPERTIES ("orc.compress"="SNAPPY");
-- Optional tuning: uncomment to force the number of reducers for this load.
-- SET mapred.reduce.tasks=128;

-- Populate the training table with every raw row whose rowid is NOT in the
-- test set, appending the bias feature via addBias and shuffling the rows
-- with a seeded random key.
INSERT OVERWRITE TABLE webspam_train_orcfile
SELECT
  s.rowid,
  s.label,
  addBias(s.features) AS features
FROM webspam_raw s
WHERE NOT EXISTS (
  SELECT t.rowid
  FROM webspam_test t
  WHERE t.rowid = s.rowid
)
CLUSTER BY rand(43);

-- Optional tuning: uncomment to restore the default reducer count afterwards.
-- SET mapred.reduce.tasks=-1;
-- Amplification settings, substituted into the view below as ${xtimes} and
-- ${shufflebuffersize}; hivemall.amplify.seed fixes the seed used by Hivemall.
SET hivevar:xtimes=3;
SET hivevar:shufflebuffersize=100;
SET hivemall.amplify.seed=32;

-- View over the training table that passes every row through rand_amplify
-- with the settings above and exposes the result as (rowid, label, features).
CREATE OR REPLACE VIEW webspam_train_x3 AS
SELECT
  rand_amplify(${xtimes}, ${shufflebuffersize}, *) AS (rowid, label, features)
FROM webspam_train_orcfile;
-- One row per (test example, feature): explode the bias-augmented feature
-- array, then split each "name:value" string on ":" into the feature name
-- and its value cast to FLOAT.
CREATE TABLE webspam_test_exploded AS
SELECT
  rowid,
  label,
  split(feature, ":")[0]                AS feature,
  CAST(split(feature, ":")[1] AS FLOAT) AS value
FROM webspam_test
LATERAL VIEW explode(addBias(features)) t AS feature;
Caution: For this dataset, use a small shufflebuffersize: each training example has a large number of features, and (xtimes * shufflebuffersize * N) training examples are cached in memory.