This repository has been archived by the owner on Oct 8, 2019. It is now read-only.
-
Notifications
You must be signed in to change notification settings - Fork 153
KDD2010a binary classification dataset
myui edited this page Oct 9, 2014
·
7 revisions
Dataset source (the `kdda` and `kdda.t` files used below): http://www.csie.ntu.edu.tw/~cjlin/libsvmtools/datasets/binary.html#kdd2010 (algebra)
-- Session setup: put the Hivemall jar on the session classpath and run the
-- bundled definition script. Presumably define-all.hive registers the Hivemall
-- functions used later on this page (addBias, rand_amplify) - confirm against
-- the script itself.
add jar ./tmp/hivemall.jar;
source ./tmp/define-all.hive;
-- IF NOT EXISTS keeps this step re-runnable: a plain CREATE DATABASE fails
-- when kdd2010 is already present.
CREATE DATABASE IF NOT EXISTS kdd2010;
USE kdd2010;
-- Raw KDD2010a training data, read in place from the text files under
-- /dataset/kdd10a/train. Each line holds three tab-separated fields
-- (rowid, label, features) and the entries of the features array are
-- separated by commas. Presumably each entry is an "index:value" string,
-- as the test-set explode step on this page splits entries on ":" - confirm.
-- EXTERNAL: dropping the table leaves the files in place.
-- IF NOT EXISTS: re-running the script does not fail on an existing table.
CREATE EXTERNAL TABLE IF NOT EXISTS kdd10a_train (
  rowid INT,
  label INT,
  features ARRAY<STRING>
)
ROW FORMAT DELIMITED
  FIELDS TERMINATED BY '\t'
  COLLECTION ITEMS TERMINATED BY ","
STORED AS TEXTFILE
LOCATION '/dataset/kdd10a/train';
-- Raw KDD2010a test data, read in place from the text files under
-- /dataset/kdd10a/test. Same layout as the training table: three
-- tab-separated fields (rowid, label, features) with the entries of the
-- features array separated by commas.
-- EXTERNAL: dropping the table leaves the files in place.
-- IF NOT EXISTS: re-running the script does not fail on an existing table.
CREATE EXTERNAL TABLE IF NOT EXISTS kdd10a_test (
  rowid INT,
  label INT,
  features ARRAY<STRING>
)
ROW FORMAT DELIMITED
  FIELDS TERMINATED BY '\t'
  COLLECTION ITEMS TERMINATED BY ","
STORED AS TEXTFILE
LOCATION '/dataset/kdd10a/test';
# Shell commands (run from a terminal, not from the Hive prompt).
# Each line runs conv.awk over a downloaded dataset file and streams the
# result through stdin ("-put -") into the HDFS directory backing the
# corresponding external table: kdda -> train, kdda.t -> test.
# conv.awk is not shown here; presumably it converts the LIBSVM-format input
# into the tab/comma-delimited layout the external tables declare - confirm.
awk -f conv.awk kdda | hadoop fs -put - /dataset/kdd10a/train/kdda
awk -f conv.awk kdda.t | hadoop fs -put - /dataset/kdd10a/test/kdda.t
-- Managed copy of the training data stored as ORC with Snappy compression.
-- It is filled by the INSERT OVERWRITE from kdd10a_train on this page.
-- IF NOT EXISTS: re-running the script does not fail on an existing table.
-- NOTE(review): rowid is BIGINT here but INT in kdd10a_train, so the load
-- relies on implicit widening - confirm the wider type is intended.
CREATE TABLE IF NOT EXISTS kdd10a_train_orcfile (
  rowid BIGINT,
  label INT,
  features ARRAY<STRING>
)
STORED AS ORC
TBLPROPERTIES ("orc.compress"="SNAPPY");
-- Load the raw training rows into the ORC table in randomized order.
-- CLUSTER BY rand() distributes rows to reducers by a random key and sorts
-- them by that key within each reducer, which shuffles the training set.
-- Columns are listed explicitly instead of SELECT * because the INSERT maps
-- columns by position: a schema change in kdd10a_train would otherwise
-- misalign the target columns silently.
-- Optional tuning: uncomment the two SET lines to pin the reducer count for
-- this statement and restore the default afterwards.
-- SET mapred.reduce.tasks=64;
INSERT OVERWRITE TABLE kdd10a_train_orcfile
SELECT
  rowid,
  label,
  features
FROM kdd10a_train
CLUSTER BY rand();
-- SET mapred.reduce.tasks=-1;
-- Flatten the test set to one row per (rowid, feature) pair.
-- The features array is first passed through the Hivemall function addBias
-- (presumably it appends a constant bias entry - confirm), then exploded so
-- that every array entry becomes its own row. Each entry is split on ":" :
-- the part before the colon becomes the feature column and the part after it
-- is cast to FLOAT as the value column.
CREATE TABLE kdd10a_test_exploded AS
SELECT
  test_rows.rowid AS rowid,
  test_rows.label AS label,
  split(ft.fv, ":")[0] AS feature,
  CAST(split(ft.fv, ":")[1] AS FLOAT) AS value
FROM kdd10a_test test_rows
LATERAL VIEW explode(addBias(test_rows.features)) ft AS fv;
-- Hive variables consumed by the kdd10a_train_x3 view definition:
--   shufflebuffersize : second argument passed to rand_amplify
--   xtimes            : first argument passed to rand_amplify
-- The commented-out seed line is kept as an optional setting.
SET hivevar:shufflebuffersize=1000;
SET hivevar:xtimes=3;
-- set hivemall.amplify.seed=32;
-- Amplified view over the ORC training table.
-- rand_amplify is a Hivemall table function called with the xtimes and
-- shufflebuffersize Hive variables followed by all input columns; its output
-- is exposed as (rowid, label, features). Presumably it emits every input row
-- xtimes times, shuffled through a buffer of shufflebuffersize rows - confirm
-- against the Hivemall documentation.
-- NOTE(review): Hive substitutes the variables into the statement text before
-- the view is created, so both hivevars must be set first, and changing them
-- later presumably requires re-creating the view. The x3 suffix in the view
-- name matches xtimes=3.
CREATE OR REPLACE VIEW kdd10a_train_x3 AS
SELECT
  rand_amplify(${xtimes}, ${shufflebuffersize}, *) AS (rowid, label, features)
FROM kdd10a_train_orcfile;