This repository has been archived by the owner on Oct 8, 2019. It is now read-only.
-
Notifications
You must be signed in to change notification settings - Fork 153
E2006 tfidf regression dataset
Makoto YUI edited this page May 3, 2015
·
22 revisions
Dataset source: http://www.csie.ntu.edu.tw/~cjlin/libsvmtools/datasets/regression.html#E2006-tfidf
# Convert E2006.train and E2006.test to .tsv with conv.awk (script not shown
# here), then upload each converted file to its own HDFS directory.
cd /mnt/archive/datasets/regression/E2006-tfidf
for part in train test; do
  awk -f conv.awk "E2006.${part}" > "E2006.${part}.tsv"
  hadoop fs -mkdir -p "/dataset/E2006-tfidf/${part}"
  hadoop fs -put "E2006.${part}.tsv" "/dataset/E2006-tfidf/${part}"
done
-- Create the working database only when it is missing so that the
-- script can be run again without failing on an existing database.
create database if not exists E2006;
use E2006;

-- Reload the Hivemall jar: drop any copy registered earlier in this
-- session, then register the current build.
delete jar /home/myui/tmp/hivemall.jar;
add jar /home/myui/tmp/hivemall.jar;
-- Run the script that defines the Hivemall functions used below
-- (assumed from the file name define-all.hive - confirm).
source /home/myui/tmp/define-all.hive;
-- Training split stored as tab-separated text: rowid, target, and a
-- comma-separated list of feature strings.
-- IF NOT EXISTS keeps a re-run from failing when the table is already defined.
-- EXTERNAL means dropping the table leaves the files under LOCATION in place.
CREATE EXTERNAL TABLE IF NOT EXISTS e2006tfidf_train (
  rowid INT,
  target FLOAT,
  features ARRAY<STRING>
)
ROW FORMAT DELIMITED
  FIELDS TERMINATED BY '\t'
  COLLECTION ITEMS TERMINATED BY ","
STORED AS TEXTFILE
LOCATION '/dataset/E2006-tfidf/train';
-- Test split stored as tab-separated text: rowid, target, and a
-- comma-separated list of feature strings.
-- IF NOT EXISTS keeps a re-run from failing when the table is already defined.
-- EXTERNAL means dropping the table leaves the files under LOCATION in place.
CREATE EXTERNAL TABLE IF NOT EXISTS e2006tfidf_test (
  rowid INT,
  target FLOAT,
  features ARRAY<STRING>
)
ROW FORMAT DELIMITED
  FIELDS TERMINATED BY '\t'
  COLLECTION ITEMS TERMINATED BY ","
STORED AS TEXTFILE
LOCATION '/dataset/E2006-tfidf/test';
-- Flatten the test set to one row per (rowid, feature) pair.
-- The array returned by addBias(features) is exploded so each element becomes
-- its own row, and each element is split on the colon into a feature name
-- and a float value.
CREATE TABLE e2006tfidf_test_exploded AS
SELECT
  rowid,
  target,
  split(fv, ":")[0] AS feature,
  CAST(split(fv, ":")[1] AS FLOAT) AS value
  -- hivemall v0.3.1 or later can use these two expressions instead:
  --   extract_feature(fv) as feature,
  --   extract_weight(fv) as value
FROM
  e2006tfidf_test
  LATERAL VIEW explode(addBias(features)) exploded AS fv;
-- set mapred.reduce.tasks=32;
-- (Optional reducer-count override, left disabled by the author.)

-- Substitution variables: seed feeds rand() and xtimes is the first
-- argument passed to amplify().
set hivevar:seed=31;
set hivevar:xtimes=3;

-- View over the training set produced by the amplify() table function,
-- presumably emitting every row xtimes times (see the Hivemall docs to confirm).
-- CLUSTER BY rand(seed) distributes and sorts the rows on a seeded random value.
CREATE OR REPLACE VIEW e2006tfidf_train_x3 AS
SELECT
  rowid,
  target,
  features
FROM (
  SELECT amplify(${xtimes}, *) AS (rowid, target, features)
  FROM e2006tfidf_train
) amplified
CLUSTER BY rand(${seed});
-- set mapred.reduce.tasks=-1;