From 5bea5f3ec97ec06e8c40b2f0f625be7ccc36e8cd Mon Sep 17 00:00:00 2001 From: Harleen Mann Date: Sat, 4 Jan 2020 22:57:39 +1100 Subject: [PATCH 1/6] added modules for go --- go.mod | 5 +++++ go.sum | 16 ++++++++++++++++ 2 files changed, 21 insertions(+) create mode 100644 go.mod create mode 100644 go.sum diff --git a/go.mod b/go.mod new file mode 100644 index 0000000..5248fb9 --- /dev/null +++ b/go.mod @@ -0,0 +1,5 @@ +module github.com/go-gota/gota + +go 1.12 + +require gonum.org/v1/gonum v0.6.2 diff --git a/go.sum b/go.sum new file mode 100644 index 0000000..0494d16 --- /dev/null +++ b/go.sum @@ -0,0 +1,16 @@ +github.com/ajstarks/svgo v0.0.0-20180226025133-644b8db467af/go.mod h1:K08gAheRH3/J6wwsYMMT4xOr94bZjxIelGM0+d/wbFw= +github.com/fogleman/gg v1.2.1-0.20190220221249-0403632d5b90/go.mod h1:R/bRT+9gY/C5z7JzPU0zXsXHKM4/ayA+zqcVNZzPa1k= +github.com/golang/freetype v0.0.0-20170609003504-e2365dfdc4a0/go.mod h1:E/TSTwGwJL78qG/PmXZO1EjYhfJinVAhrmmHX6Z8B9k= +github.com/jung-kurt/gofpdf v1.0.3-0.20190309125859-24315acbbda5/go.mod h1:7Id9E/uU8ce6rXgefFLlgrJj/GYY22cpxn+r32jIOes= +golang.org/x/exp v0.0.0-20180321215751-8460e604b9de/go.mod h1:CJ0aWSM057203Lf6IL+f9T1iT9GByDxfZKAQTCR3kQA= +golang.org/x/exp v0.0.0-20180807140117-3d87b88a115f/go.mod h1:CJ0aWSM057203Lf6IL+f9T1iT9GByDxfZKAQTCR3kQA= +golang.org/x/exp v0.0.0-20190125153040-c74c464bbbf2/go.mod h1:CJ0aWSM057203Lf6IL+f9T1iT9GByDxfZKAQTCR3kQA= +golang.org/x/image v0.0.0-20180708004352-c73c2afc3b81/go.mod h1:ux5Hcp/YLpHSI86hEcLt0YII63i6oz57MZXIpbrjZUs= +golang.org/x/tools v0.0.0-20180525024113-a5b4c53f6e8b/go.mod h1:n7NCudcB/nEzxVGmLbDWY5pfWTLqBcC2KZ6jyYvM4mQ= +golang.org/x/tools v0.0.0-20190206041539-40960b6deb8e/go.mod h1:n7NCudcB/nEzxVGmLbDWY5pfWTLqBcC2KZ6jyYvM4mQ= +gonum.org/v1/gonum v0.0.0-20180816165407-929014505bf4/go.mod h1:Y+Yx5eoAFn32cQvJDxZx5Dpnq+c3wtXuadVZAcxbbBo= +gonum.org/v1/gonum v0.6.2 h1:4r+yNT0+8SWcOkXP+63H2zQbN+USnC73cjGUxnDF94Q= +gonum.org/v1/gonum v0.6.2/go.mod 
h1:9mxDZsDKxgMAuccQkewq682L+0eCu4dCN2yonUJTCLU= +gonum.org/v1/netlib v0.0.0-20190313105609-8cb42192e0e0/go.mod h1:wa6Ws7BG/ESfp6dHfk7C6KdzKA7wR7u/rKwOGE66zvw= +gonum.org/v1/plot v0.0.0-20190515093506-e2840ee46a6b/go.mod h1:Wt8AAjI+ypCyYX3nZBvf6cAIx93T+c/OS2HFAYskSZc= +rsc.io/pdf v0.1.1/go.mod h1:n8OzWcQ6Sp37PL01nO98y4iUCRdTGarVfzxY20ICaU4= From 0893537d21d67285ea2f186d287b57ac174b6864 Mon Sep 17 00:00:00 2001 From: Harleen Mann Date: Sun, 5 Jan 2020 00:54:22 +1100 Subject: [PATCH 2/6] series insert implemented for intElements --- dataframe/dataframe.go | 26 +++++++++++++ series/series.go | 47 +++++++++++++++++++++++ series/series_test.go | 87 ++++++++++++++++++++++++++++++++++-------- 3 files changed, 144 insertions(+), 16 deletions(-) diff --git a/dataframe/dataframe.go b/dataframe/dataframe.go index cf1ae41..c41d481 100644 --- a/dataframe/dataframe.go +++ b/dataframe/dataframe.go @@ -427,6 +427,32 @@ func (df DataFrame) RBind(dfb DataFrame) DataFrame { return New(expandedSeries...) } +// TODO after series append has been modified +// Append will add new dataframe to an existing DataFrame at a given position. +// func (df DataFrame) Append(pos int, dfb DataFrame) DataFrame { +// if df.Err != nil { +// return df +// } +// if dfb.Err != nil { +// return dfb +// } +// if newvalues.Err != nil { +// return DataFrame{Err: fmt.Errorf("argument has errors: %v", newvalues.Err)} +// } +// if df.ncols != newvalues.ncols { +// return DataFrame{Err: fmt.Errorf("different number of columns")} +// } +// columns := make([]series.Series, df.ncols) +// for i, s := range df.columns { +// columns[i] = s.Set(indexes, newvalues.columns[i]) +// if columns[i].Err != nil { +// df = DataFrame{Err: fmt.Errorf("setting error on column %d: %v", i, columns[i].Err)} +// return df +// } +// } +// return df +// } + // Mutate changes a column of the DataFrame with the given Series or adds it as // a new column if the column name does not exist. 
func (df DataFrame) Mutate(s series.Series) DataFrame { diff --git a/series/series.go b/series/series.go index 79fe0a0..c8e0e81 100644 --- a/series/series.go +++ b/series/series.go @@ -254,6 +254,53 @@ func (s *Series) Append(values interface{}) { } } +// Insert adds new elements to the nth position of the Series provided by 'pos' parameter +// pos = 2 implies: +// 1. Insert after 2 elements of Series +// 2. Or 0th and 1st elements of the Series stay as is +// When using Append, the Series is modified in place. +func (s *Series) Insert(values interface{}, pos int) *Series { + if pos > s.elements.Len() { + s.Err = fmt.Errorf("pos (=%v) cannot be greater than length of the series (=%v)", pos, s.elements.Len()) + return s + } + if pos == -1 { + pos = s.elements.Len() + } + + if err := s.Err; err != nil { + return s + } + news := New(values, s.t, s.Name) + + switch s.t { + case String: + s.elements = append(append(s.elements.(stringElements)[:pos], news.elements.(stringElements)...), s.elements.(stringElements)[pos:]...) + case Int: + // the following won't work: + // part1 := s.elements.(intElements)[:pos] + // part2 := s.elements.(intElements)[pos:] + // hence, createing part1 and part2 as two new series elements + // + part1 := make(intElements, pos) + for i := 0; i < pos; i++ { + part1[i] = s.elements.(intElements)[i] + } + part2 := make(intElements, s.elements.Len()-pos) + j := 0 + for i := pos; i < s.elements.Len(); i++ { + part2[j] = s.elements.(intElements)[i] + j++ + } + s.elements = append(append(part1, news.elements.(intElements)...), part2...) + case Float: + s.elements = append(s.elements.(floatElements), news.elements.(floatElements)...) + case Bool: + s.elements = append(s.elements.(boolElements), news.elements.(boolElements)...) + } + return s +} + // Concat concatenates two series together. It will return a new Series with the // combined elements of both Series. 
func (s Series) Concat(x Series) Series { diff --git a/series/series_test.go b/series/series_test.go index c7d0516..e10e5ea 100644 --- a/series/series_test.go +++ b/series/series_test.go @@ -4,8 +4,8 @@ import ( "fmt" "math" "reflect" - "testing" "strings" + "testing" ) // Check that there are no shared memory addreses between the elements of two Series @@ -1525,9 +1525,8 @@ func TestSeries_Quantile(t *testing.T) { } } - func TestSeries_Map(t *testing.T) { - tests := []struct { + tests := []struct { series Series expected Series }{ @@ -1564,11 +1563,11 @@ func TestSeries_Map(t *testing.T) { doubleFloat64 := func(e Element) Element { var result Element result = e.Copy() - result.Set(result.Float() * 2) + result.Set(result.Float() * 2) return Element(result) } - // and two booleans + // and two booleans and := func(e Element) Element { var result Element result = e.Copy() @@ -1588,11 +1587,11 @@ func TestSeries_Map(t *testing.T) { i, err := result.Int() if err != nil { return Element(&intElement{ - e: +5, + e: +5, nan: false, }) } - result.Set(i + 5) + result.Set(i + 5) return Element(result) } @@ -1604,12 +1603,12 @@ func TestSeries_Map(t *testing.T) { return Element(result) } - for testnum, test := range tests { + for testnum, test := range tests { switch test.series.Type() { case Bool: expected := test.expected received := test.series.Map(and) - for i := 0 ; i len " + // }, + // { + // "!!! test pos == -1 " + // } + } + + for testnum, test := range tests { + test.series.Insert(test.value, test.pos) + + if fmt.Sprint(test.series) != test.expected { + t.Errorf("Test:%v failed. 
%v \n actual=%v", testnum, test.desc, test.series) + } + // if err := b.Err; err != nil { + // t.Errorf("Test:%v\nError:%v", testnum, err) + // } + // if err := checkTypes(b); err != nil { + // t.Errorf("Test:%v\nError:%v", testnum, err) + // } + //if err := checkAddr(a.Addr(), b.Addr()); err != nil { + //t.Errorf("Test:%v\nError:%v\nA:%v\nB:%v", testnum, err, a.Addr(), b.Addr()) + //} + } +} From acf6769c45ab9113a32fd6b642351dacae45019b Mon Sep 17 00:00:00 2001 From: Harleen Mann Date: Sun, 5 Jan 2020 21:13:25 +1100 Subject: [PATCH 3/6] series insert implemented. inc tests --- series/series.go | 38 +++++++++++++------------------------- series/series_test.go | 40 +++++++++++++++++++++++----------------- 2 files changed, 36 insertions(+), 42 deletions(-) diff --git a/series/series.go b/series/series.go index c8e0e81..2f03baa 100644 --- a/series/series.go +++ b/series/series.go @@ -255,50 +255,38 @@ func (s *Series) Append(values interface{}) { } // Insert adds new elements to the nth position of the Series provided by 'pos' parameter -// pos = 2 implies: +// e.g. pos = 2 implies: // 1. Insert after 2 elements of Series // 2. Or 0th and 1st elements of the Series stay as is -// When using Append, the Series is modified in place. -func (s *Series) Insert(values interface{}, pos int) *Series { +// When using Insert, the Series is modified in place. +func (s *Series) Insert(values interface{}, pos int) { if pos > s.elements.Len() { s.Err = fmt.Errorf("pos (=%v) cannot be greater than length of the series (=%v)", pos, s.elements.Len()) - return s + return } if pos == -1 { pos = s.elements.Len() } if err := s.Err; err != nil { - return s + return } news := New(values, s.t, s.Name) switch s.t { case String: - s.elements = append(append(s.elements.(stringElements)[:pos], news.elements.(stringElements)...), s.elements.(stringElements)[pos:]...) 
+ // the following won't work in some cases: + // s.elements = append(append(s.elements.(stringElements)[:pos], news.elements.(stringElements)...), s.elements.(stringElements)[pos:]...) + // it may cause mutation of s.elements during inner append resulting in undesired output + s.elements = append(s.elements.(stringElements)[:pos], append(news.elements.(stringElements), s.elements.(stringElements)[pos:]...)...) case Int: - // the following won't work: - // part1 := s.elements.(intElements)[:pos] - // part2 := s.elements.(intElements)[pos:] - // hence, createing part1 and part2 as two new series elements - // - part1 := make(intElements, pos) - for i := 0; i < pos; i++ { - part1[i] = s.elements.(intElements)[i] - } - part2 := make(intElements, s.elements.Len()-pos) - j := 0 - for i := pos; i < s.elements.Len(); i++ { - part2[j] = s.elements.(intElements)[i] - j++ - } - s.elements = append(append(part1, news.elements.(intElements)...), part2...) + s.elements = append(s.elements.(intElements)[:pos], append(news.elements.(intElements), s.elements.(intElements)[pos:]...)...) case Float: - s.elements = append(s.elements.(floatElements), news.elements.(floatElements)...) + s.elements = append(s.elements.(floatElements)[:pos], append(news.elements.(floatElements), s.elements.(floatElements)[pos:]...)...) case Bool: - s.elements = append(s.elements.(boolElements), news.elements.(boolElements)...) + s.elements = append(s.elements.(boolElements)[:pos], append(news.elements.(boolElements), s.elements.(boolElements)[pos:]...)...) } - return s + return } // Concat concatenates two series together. It will return a new Series with the diff --git a/series/series_test.go b/series/series_test.go index e10e5ea..850f3fb 100644 --- a/series/series_test.go +++ b/series/series_test.go @@ -1692,28 +1692,34 @@ func TestSeries_Insert(t *testing.T) { 3, "[1 2 3 4 5 6 7]", }, - // { - // "!!! test pos > len " - // }, - // { - // "!!! 
test pos == -1 " - // } + { + "TestSeries_Insert:3: SeriesFloat.Insert([]Float) & pos=3 of Series i.e. after 3 elements of Series", + Floats([]float64{1.0, 2.0, 3.0, 6.0, 7.0}), + []float64{4, 5}, + 3, + "[1.000000 2.000000 3.000000 4.000000 5.000000 6.000000 7.000000]", + }, + { + "TestSeries_Insert:4: SeriesBool.Insert([]Bool) & pos=-1", + Bools([]bool{true, true}), + []bool{false, false}, + -1, + "[true true false false]", + }, + { + "TestSeries_Insert_ERROR:5: SeriesBool.Insert([]Bool) & pos > length of series", + Bools([]bool{true, true}), + []bool{false, false}, + 3, + "pos (=3) cannot be greater than length of the series (=2)", + }, } for testnum, test := range tests { test.series.Insert(test.value, test.pos) - if fmt.Sprint(test.series) != test.expected { - t.Errorf("Test:%v failed. %v \n actual=%v", testnum, test.desc, test.series) + if fmt.Sprint(test.series) != test.expected && fmt.Sprint(test.series.Err) != test.expected { + t.Errorf("Test:%v failed. %v \n expected=%v \t actualValue=%v \t actualError=%v", testnum, test.desc, test.expected, test.series, test.series.Err) } - // if err := b.Err; err != nil { - // t.Errorf("Test:%v\nError:%v", testnum, err) - // } - // if err := checkTypes(b); err != nil { - // t.Errorf("Test:%v\nError:%v", testnum, err) - // } - //if err := checkAddr(a.Addr(), b.Addr()); err != nil { - //t.Errorf("Test:%v\nError:%v\nA:%v\nB:%v", testnum, err, a.Addr(), b.Addr()) - //} } } From c569f6ff27ece1d909ad7bb322960d67db50016b Mon Sep 17 00:00:00 2001 From: Harleen Mann Date: Sun, 5 Jan 2020 22:48:05 +1100 Subject: [PATCH 4/6] dataframe insert implemented. 
tests included --- dataframe/dataframe.go | 52 +++++++++++++------------- dataframe/dataframe_test.go | 74 +++++++++++++++++++++++++++++++++++++ 2 files changed, 101 insertions(+), 25 deletions(-) diff --git a/dataframe/dataframe.go b/dataframe/dataframe.go index c41d481..b2f0d54 100644 --- a/dataframe/dataframe.go +++ b/dataframe/dataframe.go @@ -427,31 +427,33 @@ func (df DataFrame) RBind(dfb DataFrame) DataFrame { return New(expandedSeries...) } -// TODO after series append has been modified -// Append will add new dataframe to an existing DataFrame at a given position. -// func (df DataFrame) Append(pos int, dfb DataFrame) DataFrame { -// if df.Err != nil { -// return df -// } -// if dfb.Err != nil { -// return dfb -// } -// if newvalues.Err != nil { -// return DataFrame{Err: fmt.Errorf("argument has errors: %v", newvalues.Err)} -// } -// if df.ncols != newvalues.ncols { -// return DataFrame{Err: fmt.Errorf("different number of columns")} -// } -// columns := make([]series.Series, df.ncols) -// for i, s := range df.columns { -// columns[i] = s.Set(indexes, newvalues.columns[i]) -// if columns[i].Err != nil { -// df = DataFrame{Err: fmt.Errorf("setting error on column %d: %v", i, columns[i].Err)} -// return df -// } -// } -// return df -// } +// Insert will add new dataframe to an existing DataFrame at a given position. 
+func (df DataFrame) Insert(dfb DataFrame, pos int) DataFrame { + if df.Err != nil { + return df + } + if dfb.Err != nil { + return dfb + } + expandedSeries := make([]series.Series, df.ncols) + for k, v := range df.Names() { + idx := findInStringSlice(v, dfb.Names()) + if idx == -1 { + return DataFrame{Err: fmt.Errorf("insert: column names are not compatible")} + } + + originalSeries := df.columns[k] + addedSeries := dfb.columns[idx] + + originalSeries.Insert(addedSeries, pos) + if err := originalSeries.Err; err != nil { + return DataFrame{Err: fmt.Errorf("insert: %v", err)} + } + + expandedSeries[k] = originalSeries + } + return New(expandedSeries...) +} // Mutate changes a column of the DataFrame with the given Series or adds it as // a new column if the column name does not exist. diff --git a/dataframe/dataframe_test.go b/dataframe/dataframe_test.go index 114c0e4..dfe9d89 100644 --- a/dataframe/dataframe_test.go +++ b/dataframe/dataframe_test.go @@ -2,6 +2,7 @@ package dataframe import ( "bytes" + "fmt" "reflect" "strconv" "strings" @@ -2558,3 +2559,76 @@ func TestDescribe(t *testing.T) { } } } + +func TestDataFrame_Insert(t *testing.T) { + tests := []struct { + desc string + df DataFrame + value DataFrame + pos int + expected string + }{ + { + "TestDataFrame_Insert:0: DataframeString.Insert(DataframeString) & pos=end of Series", + LoadRecords( + [][]string{ + {"A", "C", "D"}, + {"1", "5.1", "true"}, + {"NaN", "6.0", "true"}, + {"2", "6.0", "false"}, + }, + ), + LoadRecords( + [][]string{ + {"A", "C", "D"}, + {"2", "7.1", "false"}, + }, + ), + -1, + `[4x3] DataFrame + + A C D + 0: 1 5.100000 true + 1: NaN 6.000000 true + 2: 2 6.000000 false + 3: 2 7.100000 false + +`, + }, + { + "TestDataFrame_Insert:1: DataFrameString.Insert(DataFrameString) & pos=0", + LoadRecords( + [][]string{ + {"A", "C", "D"}, + {"1", "5.1", "true"}, + {"NaN", "6.0", "true"}, + {"2", "6.0", "false"}, + }, + ), + LoadRecords( + [][]string{ + {"A", "C", "D"}, + {"2", "7.1", "false"}, + }, + 
), + 0, + `[4x3] DataFrame + + A C D + 0: 2 7.100000 false + 1: 1 5.100000 true + 2: NaN 6.000000 true + 3: 2 6.000000 false + +`, + }, + } + + for testnum, test := range tests { + actual := test.df.Insert(test.value, test.pos) + + if fmt.Sprint(actual) != test.expected && fmt.Sprint(actual.Err) != test.expected { + t.Errorf("Test:%v failed. %v \n expected=\n%v \n actualValue=\n%v \n actualError=\n%v", testnum, test.desc, test.expected, actual, actual.Err) + } + } +} From 4a2ec4d6a685e5e222f257dcb5f11bdcc7db43d1 Mon Sep 17 00:00:00 2001 From: Harleen Mann <22973248+mannharleen@users.noreply.github.com> Date: Tue, 21 Jan 2020 16:24:34 +1100 Subject: [PATCH 5/6] changes to TestDataFrame and formatting as per review comments --- dataframe/dataframe_test.go | 57 ++++++++++++++++++++++--------------- go.mod | 5 ---- go.sum | 16 ----------- series/series_test.go | 29 ++++++++++--------- 4 files changed, 49 insertions(+), 58 deletions(-) delete mode 100644 go.mod delete mode 100644 go.sum diff --git a/dataframe/dataframe_test.go b/dataframe/dataframe_test.go index dfe9d89..aa9a792 100644 --- a/dataframe/dataframe_test.go +++ b/dataframe/dataframe_test.go @@ -2,7 +2,6 @@ package dataframe import ( "bytes" - "fmt" "reflect" "strconv" "strings" @@ -2566,7 +2565,7 @@ func TestDataFrame_Insert(t *testing.T) { df DataFrame value DataFrame pos int - expected string + expected DataFrame }{ { "TestDataFrame_Insert:0: DataframeString.Insert(DataframeString) & pos=end of Series", @@ -2585,15 +2584,15 @@ func TestDataFrame_Insert(t *testing.T) { }, ), -1, - `[4x3] DataFrame - - A C D - 0: 1 5.100000 true - 1: NaN 6.000000 true - 2: 2 6.000000 false - 3: 2 7.100000 false - -`, + LoadRecords( + [][]string{ + {"A", "C", "D"}, + {"1", "5.1", "true"}, + {"NaN", "6.0", "true"}, + {"2", "6.0", "false"}, + {"2", "7.1", "false"}, + }, + ), }, { "TestDataFrame_Insert:1: DataFrameString.Insert(DataFrameString) & pos=0", @@ -2612,23 +2611,35 @@ func TestDataFrame_Insert(t *testing.T) { }, ), 0, 
- `[4x3] DataFrame - - A C D - 0: 2 7.100000 false - 1: 1 5.100000 true - 2: NaN 6.000000 true - 3: 2 6.000000 false - -`, + LoadRecords( + [][]string{ + {"A", "C", "D"}, + {"2", "7.1", "false"}, + {"1", "5.1", "true"}, + {"NaN", "6.0", "true"}, + {"2", "6.0", "false"}, + }, + ), }, } - for testnum, test := range tests { + for i, test := range tests { actual := test.df.Insert(test.value, test.pos) - if fmt.Sprint(actual) != test.expected && fmt.Sprint(actual.Err) != test.expected { - t.Errorf("Test:%v failed. %v \n expected=\n%v \n actualValue=\n%v \n actualError=\n%v", testnum, test.desc, test.expected, actual, actual.Err) + if test.df.Err != nil { + t.Errorf("Test: %d\nError:%v", i, test.df.Err) + } + // Check that the types are the same between both DataFrames + if !reflect.DeepEqual(test.expected.Types(), actual.Types()) { + t.Errorf("Test: %d\nDifferent types:\nexpected:%v\nactual:%v", i, test.expected.Types(), actual.Types()) + } + // Check that the colnames are the same between both DataFrames + if !reflect.DeepEqual(test.expected.Names(), actual.Names()) { + t.Errorf("Test: %d\nDifferent colnames:\nexpected:%v\nactual:%v", i, test.expected.Names(), actual.Names()) + } + // Check that the values are the same between both DataFrames + if !reflect.DeepEqual(test.expected.Records(), actual.Records()) { + t.Errorf("Test: %d: Different values:\nexpected:%v\nactual:%v", i, test.expected, actual) } } } diff --git a/go.mod b/go.mod deleted file mode 100644 index 5248fb9..0000000 --- a/go.mod +++ /dev/null @@ -1,5 +0,0 @@ -module github.com/go-gota/gota - -go 1.12 - -require gonum.org/v1/gonum v0.6.2 diff --git a/go.sum b/go.sum deleted file mode 100644 index 0494d16..0000000 --- a/go.sum +++ /dev/null @@ -1,16 +0,0 @@ -github.com/ajstarks/svgo v0.0.0-20180226025133-644b8db467af/go.mod h1:K08gAheRH3/J6wwsYMMT4xOr94bZjxIelGM0+d/wbFw= -github.com/fogleman/gg v1.2.1-0.20190220221249-0403632d5b90/go.mod h1:R/bRT+9gY/C5z7JzPU0zXsXHKM4/ayA+zqcVNZzPa1k= 
-github.com/golang/freetype v0.0.0-20170609003504-e2365dfdc4a0/go.mod h1:E/TSTwGwJL78qG/PmXZO1EjYhfJinVAhrmmHX6Z8B9k= -github.com/jung-kurt/gofpdf v1.0.3-0.20190309125859-24315acbbda5/go.mod h1:7Id9E/uU8ce6rXgefFLlgrJj/GYY22cpxn+r32jIOes= -golang.org/x/exp v0.0.0-20180321215751-8460e604b9de/go.mod h1:CJ0aWSM057203Lf6IL+f9T1iT9GByDxfZKAQTCR3kQA= -golang.org/x/exp v0.0.0-20180807140117-3d87b88a115f/go.mod h1:CJ0aWSM057203Lf6IL+f9T1iT9GByDxfZKAQTCR3kQA= -golang.org/x/exp v0.0.0-20190125153040-c74c464bbbf2/go.mod h1:CJ0aWSM057203Lf6IL+f9T1iT9GByDxfZKAQTCR3kQA= -golang.org/x/image v0.0.0-20180708004352-c73c2afc3b81/go.mod h1:ux5Hcp/YLpHSI86hEcLt0YII63i6oz57MZXIpbrjZUs= -golang.org/x/tools v0.0.0-20180525024113-a5b4c53f6e8b/go.mod h1:n7NCudcB/nEzxVGmLbDWY5pfWTLqBcC2KZ6jyYvM4mQ= -golang.org/x/tools v0.0.0-20190206041539-40960b6deb8e/go.mod h1:n7NCudcB/nEzxVGmLbDWY5pfWTLqBcC2KZ6jyYvM4mQ= -gonum.org/v1/gonum v0.0.0-20180816165407-929014505bf4/go.mod h1:Y+Yx5eoAFn32cQvJDxZx5Dpnq+c3wtXuadVZAcxbbBo= -gonum.org/v1/gonum v0.6.2 h1:4r+yNT0+8SWcOkXP+63H2zQbN+USnC73cjGUxnDF94Q= -gonum.org/v1/gonum v0.6.2/go.mod h1:9mxDZsDKxgMAuccQkewq682L+0eCu4dCN2yonUJTCLU= -gonum.org/v1/netlib v0.0.0-20190313105609-8cb42192e0e0/go.mod h1:wa6Ws7BG/ESfp6dHfk7C6KdzKA7wR7u/rKwOGE66zvw= -gonum.org/v1/plot v0.0.0-20190515093506-e2840ee46a6b/go.mod h1:Wt8AAjI+ypCyYX3nZBvf6cAIx93T+c/OS2HFAYskSZc= -rsc.io/pdf v0.1.1/go.mod h1:n8OzWcQ6Sp37PL01nO98y4iUCRdTGarVfzxY20ICaU4= diff --git a/series/series_test.go b/series/series_test.go index 850f3fb..862b7ee 100644 --- a/series/series_test.go +++ b/series/series_test.go @@ -4,8 +4,8 @@ import ( "fmt" "math" "reflect" - "strings" "testing" + "strings" ) // Check that there are no shared memory addreses between the elements of two Series @@ -1525,8 +1525,9 @@ func TestSeries_Quantile(t *testing.T) { } } + func TestSeries_Map(t *testing.T) { - tests := []struct { + tests := []struct { series Series expected Series }{ @@ -1563,11 +1564,11 @@ func TestSeries_Map(t 
*testing.T) { doubleFloat64 := func(e Element) Element { var result Element result = e.Copy() - result.Set(result.Float() * 2) + result.Set(result.Float() * 2) return Element(result) } - // and two booleans + // and two booleans and := func(e Element) Element { var result Element result = e.Copy() @@ -1587,11 +1588,11 @@ func TestSeries_Map(t *testing.T) { i, err := result.Int() if err != nil { return Element(&intElement{ - e: +5, + e: +5, nan: false, }) } - result.Set(i + 5) + result.Set(i + 5) return Element(result) } @@ -1603,12 +1604,12 @@ func TestSeries_Map(t *testing.T) { return Element(result) } - for testnum, test := range tests { + for testnum, test := range tests { switch test.series.Type() { case Bool: expected := test.expected received := test.series.Map(and) - for i := 0; i < expected.Len(); i++ { + for i := 0 ; i Date: Mon, 15 Jun 2020 10:37:12 +0200 Subject: [PATCH 6/6] Sync with dev branch and run gofmt on dataframe.go (to pass travis test) (#1) * Set up TravisCI * Make fixColnames faster (#112) Co-authored-by: Carl Kingsford * Add dataframe.Concat (#104) Concat concatenates rows of two dataframes like RBind, but also includes the unmatched columns. 
* Combining filters with AND and user-defined filters (#99) * added missing closing brace in series/series.go (function Map) * removed empty line at beginning of function series/series.go/Map * implemented user-defined comparator for series' * added and semantics for row filtering with multiple filters * refined README for filtering with OR, AND, and user-defined filters (CompFunc) * Run go mod tidy * Add extra checks for TravisCI * Use UseNumber() when parsing JSON to interface{} (#116) * Add ReadHTML to support loading HTML tables (#107) * Allow numeric column-index for filters (#106) Co-authored-by: Alex Sanchez Co-authored-by: gmarcais Co-authored-by: Carl Kingsford Co-authored-by: Mura Li Co-authored-by: Christoph Laaber Co-authored-by: gautamoncloud9 --- .travis.yml | 13 ++ README.md | 61 +++++++- dataframe/dataframe.go | 252 ++++++++++++++++++++++++++++-- dataframe/dataframe_test.go | 295 ++++++++++++++++++++++++++++++++++-- go.mod | 8 + go.sum | 23 +++ series/series.go | 39 +++-- series/series_test.go | 99 ++++++++++-- 8 files changed, 738 insertions(+), 52 deletions(-) create mode 100644 .travis.yml create mode 100644 go.mod create mode 100644 go.sum diff --git a/.travis.yml b/.travis.yml new file mode 100644 index 0000000..06428d6 --- /dev/null +++ b/.travis.yml @@ -0,0 +1,13 @@ +language: go +go: master +before_script: + - echo 'Checking code quality issues.' + - go vet ./... + - echo 'Checking that gofmt was used.' + - diff -u <(echo -n) <(gofmt -d .) + - echo 'Checking tidiness of go mod.' + - go mod tidy + - test -z "$(git status --porcelain)" +script: + - echo 'Running tests.' + - go test -v ./... 
diff --git a/README.md b/README.md index 5b8bf55..90e0ff0 100644 --- a/README.md +++ b/README.md @@ -182,14 +182,71 @@ column "B" is greater than 4: fil := df.Filter( dataframe.F{"A", series.Eq, "a"}, dataframe.F{"B", series.Greater, 4}, +) + +filAlt := df.FilterAggregation( + dataframe.Or, + dataframe.F{"A", series.Eq, "a"}, + dataframe.F{"B", series.Greater, 4}, ) +``` + +Filters inside Filter are combined as OR operations; alternatively, we can use `df.FilterAggregation` with `dataframe.Or`. + +If we want to combine filters with AND operations, we can use `df.FilterAggregation` with `dataframe.And`. + +```go +fil := df.FilterAggregation( + dataframe.And, + dataframe.F{"A", series.Eq, "a"}, + dataframe.F{"D", series.Eq, true}, +) +``` + +To combine AND and OR operations, we can use chaining of filters. + +```go +// combine filters with OR +fil := df.Filter( + dataframe.F{"A", series.Eq, "a"}, + dataframe.F{"B", series.Greater, 4}, +) +// apply AND for fil and fil2 fil2 := fil.Filter( dataframe.F{"D", series.Eq, true}, ) ``` -Filters inside Filter are combined as OR operations whereas if we chain -Filter methods, they will behave as AND. +Filtering is based on predefined comparison operators: +* `series.Eq` +* `series.Neq` +* `series.Greater` +* `series.GreaterEq` +* `series.Less` +* `series.LessEq` +* `series.In` + +However, if these filter operations are not sufficient, we can use user-defined comparators. +We use `series.CompFunc` and a user-defined function with the signature `func(series.Element) bool` to provide user-defined filters to `df.Filter` and `df.FilterAggregation`. 
+ +```go +hasPrefix := func(prefix string) func(el series.Element) bool { + return func (el series.Element) bool { + if el.Type() == String { + if val, ok := el.Val().(string); ok { + return strings.HasPrefix(val, prefix) + } + } + return false + } + } + +fil := df.Filter( + dataframe.F{"A", series.CompFunc, hasPrefix("aa")}, +) +``` + +This example filters rows based on whether they have a cell value starting with `"aa"` in column `"A"`. #### Arrange diff --git a/dataframe/dataframe.go b/dataframe/dataframe.go index b2f0d54..cc40ca2 100644 --- a/dataframe/dataframe.go +++ b/dataframe/dataframe.go @@ -14,6 +14,8 @@ import ( "unicode/utf8" "github.com/go-gota/gota/series" + "golang.org/x/net/html" + "golang.org/x/net/html/atom" ) // DataFrame is a data structure designed for operating on table like data (Such @@ -250,7 +252,7 @@ func (df DataFrame) print( // Subsetting, mutating and transforming DataFrame methods // ======================================================= -// Set will update the values of a DataFrame for all rows selected via indexes. +// Set will update the values of a DataFrame for the rows selected via indexes. func (df DataFrame) Set(indexes series.Indexes, newvalues DataFrame) DataFrame { if df.Err != nil { return df @@ -427,6 +429,54 @@ func (df DataFrame) RBind(dfb DataFrame) DataFrame { return New(expandedSeries...) } +// Concat concatenates rows of two DataFrames like RBind, but also including +// unmatched columns. 
+func (df DataFrame) Concat(dfb DataFrame) DataFrame { + if df.Err != nil { + return df + } + if dfb.Err != nil { + return dfb + } + + uniques := make(map[string]struct{}) + cols := []string{} + for _, t := range []DataFrame{df, dfb} { + for _, u := range t.Names() { + if _, ok := uniques[u]; !ok { + uniques[u] = struct{}{} + cols = append(cols, u) + } + } + } + + expandedSeries := make([]series.Series, len(cols)) + for k, v := range cols { + aidx := findInStringSlice(v, df.Names()) + bidx := findInStringSlice(v, dfb.Names()) + + // aidx and bidx must not be -1 at the same time. + var a, b series.Series + if aidx != -1 { + a = df.columns[aidx] + } else { + bb := dfb.columns[bidx] + a = series.New(make([]struct{}, df.nrows), bb.Type(), bb.Name) + } + if bidx != -1 { + b = dfb.columns[bidx] + } else { + b = series.New(make([]struct{}, dfb.nrows), a.Type(), a.Name) + } + newSeries := a.Concat(b) + if err := newSeries.Err; err != nil { + return DataFrame{Err: fmt.Errorf("concat: %v", err)} + } + expandedSeries[k] = newSeries + } + return New(expandedSeries...) +} + // Insert will add new dataframe to an existing DataFrame at a given position. func (df DataFrame) Insert(dfb DataFrame, pos int) DataFrame { if df.Err != nil { @@ -491,6 +541,7 @@ func (df DataFrame) Mutate(s series.Series) DataFrame { // F is the filtering structure type F struct { + Colidx int Colname string Comparator series.Comparator Comparando interface{} @@ -501,14 +552,47 @@ type F struct { // whereas if we chain Filter calls, every filter will act as an AND operation // with regards to the rest. func (df DataFrame) Filter(filters ...F) DataFrame { + return df.FilterAggregation(Or, filters...) 
+} + +// Aggregation defines the filter aggregation +type Aggregation int + +func (a Aggregation) String() string { + switch a { + case Or: + return "or" + case And: + return "and" + } + return fmt.Sprintf("unknown aggragation %d", a) +} + +const ( + // Or aggregates filters with logical or + Or Aggregation = iota + // And aggregates filters with logical and + And +) + +// FilterAggregation will filter the rows of a DataFrame based on the given filters. All +// filters on the argument of a Filter call are aggregated depending on the supplied +// aggregation. +func (df DataFrame) FilterAggregation(agg Aggregation, filters ...F) DataFrame { if df.Err != nil { return df } + compResults := make([]series.Series, len(filters)) for i, f := range filters { - idx := findInStringSlice(f.Colname, df.Names()) - if idx < 0 { - return DataFrame{Err: fmt.Errorf("filter: can't find column name")} + var idx int + if f.Colname == "" { + idx = f.Colidx + } else { + idx = findInStringSlice(f.Colname, df.Names()) + if idx < 0 { + return DataFrame{Err: fmt.Errorf("filter: can't find column name")} + } } res := df.columns[idx].Compare(f.Comparator, f.Comparando) if err := res.Err; err != nil { @@ -516,10 +600,11 @@ func (df DataFrame) Filter(filters ...F) DataFrame { } compResults[i] = res } - // Join compResults via "OR" + if len(compResults) == 0 { return df.Copy() } + res, err := compResults[0].Bool() if err != nil { return DataFrame{Err: fmt.Errorf("filter: %v", err)} @@ -530,7 +615,14 @@ func (df DataFrame) Filter(filters ...F) DataFrame { return DataFrame{Err: fmt.Errorf("filter: %v", err)} } for j := 0; j < len(res); j++ { - res[j] = res[j] || nextRes[j] + switch agg { + case Or: + res[j] = res[j] || nextRes[j] + case And: + res[j] = res[j] && nextRes[j] + default: + panic(agg) + } } } return df.Subset(res) @@ -1134,7 +1226,9 @@ func ReadCSV(r io.Reader, options ...LoadOption) DataFrame { // resulting records. 
func ReadJSON(r io.Reader, options ...LoadOption) DataFrame { var m []map[string]interface{} - err := json.NewDecoder(r).Decode(&m) + d := json.NewDecoder(r) + d.UseNumber() + err := d.Decode(&m) if err != nil { return DataFrame{Err: err} } @@ -1188,6 +1282,131 @@ func (df DataFrame) WriteJSON(w io.Writer) error { return json.NewEncoder(w).Encode(df.Maps()) } +// Internal state for implementing ReadHTML +type remainder struct { + index int + text string + nrows int +} + +func readRows(trs []*html.Node) [][]string { + rems := []remainder{} + rows := [][]string{} + for _, tr := range trs { + xrems := []remainder{} + row := []string{} + index := 0 + text := "" + for j, td := 0, tr.FirstChild; td != nil; j, td = j+1, td.NextSibling { + if td.Type == html.ElementNode && td.DataAtom == atom.Td { + + for len(rems) > 0 { + v := rems[0] + if v.index > index { + break + } + v, rems = rems[0], rems[1:] + row = append(row, v.text) + if v.nrows > 1 { + xrems = append(xrems, remainder{v.index, v.text, v.nrows - 1}) + } + index++ + } + + rowspan, colspan := 1, 1 + for _, attr := range td.Attr { + switch attr.Key { + case "rowspan": + if k, err := strconv.Atoi(attr.Val); err == nil { + rowspan = k + } + case "colspan": + if k, err := strconv.Atoi(attr.Val); err == nil { + colspan = k + } + } + } + for c := td.FirstChild; c != nil; c = c.NextSibling { + if c.Type == html.TextNode { + text = strings.TrimSpace(c.Data) + } + } + + for k := 0; k < colspan; k++ { + row = append(row, text) + if rowspan > 1 { + xrems = append(xrems, remainder{index, text, rowspan - 1}) + } + index++ + } + } + } + for j := 0; j < len(rems); j++ { + v := rems[j] + row = append(row, v.text) + if v.nrows > 1 { + xrems = append(xrems, remainder{v.index, v.text, v.nrows - 1}) + } + } + rows = append(rows, row) + rems = xrems + } + for len(rems) > 0 { + xrems := []remainder{} + row := []string{} + for i := 0; i < len(rems); i++ { + v := rems[i] + row = append(row, v.text) + if v.nrows > 1 { + xrems = 
append(xrems, remainder{v.index, v.text, v.nrows - 1}) + } + } + rows = append(rows, row) + rems = xrems + } + return rows +} + +func ReadHTML(r io.Reader, options ...LoadOption) []DataFrame { + var err error + var dfs []DataFrame + var doc *html.Node + var f func(*html.Node) + + doc, err = html.Parse(r) + if err != nil { + return []DataFrame{DataFrame{Err: err}} + } + + f = func(n *html.Node) { + if n.Type == html.ElementNode && n.DataAtom == atom.Table { + trs := []*html.Node{} + for c := n.FirstChild; c != nil; c = c.NextSibling { + if c.Type == html.ElementNode && c.DataAtom == atom.Tbody { + for cc := c.FirstChild; cc != nil; cc = cc.NextSibling { + if cc.Type == html.ElementNode && (cc.DataAtom == atom.Th || cc.DataAtom == atom.Tr) { + trs = append(trs, cc) + } + } + } + } + + df := LoadRecords(readRows(trs), options...) + if df.Err == nil { + dfs = append(dfs, df) + } + return + } + + for c := n.FirstChild; c != nil; c = c.NextSibling { + f(c) + } + } + + f(doc) + return dfs +} + // Getters/Setters for DataFrame fields // ==================================== @@ -1745,7 +1964,7 @@ func (df DataFrame) Elem(r, c int) series.Element { // fixColnames assigns a name to the missing column names and makes it so that the // column names are unique. 
func fixColnames(colnames []string) { - // Find duplicated colnames + // Find duplicated and missing colnames dupnamesidx := make(map[string][]int) var missingnames []int for i := 0; i < len(colnames); i++ { @@ -1754,16 +1973,17 @@ func fixColnames(colnames []string) { missingnames = append(missingnames, i) continue } - for j := 0; j < len(colnames); j++ { - b := colnames[j] - if i != j && a == b { - temp := dupnamesidx[a] - if !inIntSlice(i, temp) { - dupnamesidx[a] = append(temp, i) - } - } + // for now, dupnamesidx contains the indices of *all* the columns + // the columns with unique locations will be removed after this loop + dupnamesidx[a] = append(dupnamesidx[a], i) + } + // NOTE: deleting a map key in a range is legal and correct in Go. + for k, places := range dupnamesidx { + if len(places) < 2 { + delete(dupnamesidx, k) } } + // Now: dupnameidx contains only keys that appeared more than once // Autofill missing column names counter := 0 diff --git a/dataframe/dataframe_test.go b/dataframe/dataframe_test.go index aa9a792..a4738dd 100644 --- a/dataframe/dataframe_test.go +++ b/dataframe/dataframe_test.go @@ -553,6 +553,117 @@ func TestDataFrame_RBind(t *testing.T) { } } +func TestDataFrame_Concat(t *testing.T) { + type NA struct{} + + a := New( + series.New([]string{"b", "a", "b", "c", "d"}, series.String, "COL.1"), + series.New([]int{1, 2, 4, 5, 4}, series.Int, "COL.2"), + series.New([]float64{3.0, 4.0, 5.3, 3.2, 1.2}, series.Float, "COL.3"), + ) + table := []struct { + dfa DataFrame + dfb DataFrame + expDf DataFrame + }{ + { + a, + New( + series.New([]string{"b", "a", "b", "c", "d"}, series.String, "COL.1"), + series.New([]int{1, 2, 4, 5, 4}, series.Int, "COL.2"), + series.New([]float64{3.0, 4.0, 5.3, 3.2, 1.2}, series.Float, "COL.3"), + ), + New( + series.New([]string{"b", "a", "b", "c", "d", "b", "a", "b", "c", "d"}, series.String, "COL.1"), + series.New([]int{1, 2, 4, 5, 4, 1, 2, 4, 5, 4}, series.Int, "COL.2"), + series.New([]float64{3.0, 4.0, 5.3, 
3.2, 1.2, 3.0, 4.0, 5.3, 3.2, 1.2}, series.Float, "COL.3"), + ), + }, + { + a, + New( + series.New([]int{1, 2, 4, 5, 4}, series.Int, "COL.1"), + series.New([]int{1, 2, 4, 5, 4}, series.Int, "COL.2"), + series.New([]float64{3.0, 4.0, 5.3, 3.2, 1.2}, series.Float, "COL.3"), + ), + New( + series.New([]string{"b", "a", "b", "c", "d", "1", "2", "4", "5", "4"}, series.String, "COL.1"), + series.New([]int{1, 2, 4, 5, 4, 1, 2, 4, 5, 4}, series.Int, "COL.2"), + series.New([]float64{3.0, 4.0, 5.3, 3.2, 1.2, 3.0, 4.0, 5.3, 3.2, 1.2}, series.Float, "COL.3"), + ), + }, + + { + a, + New( + series.New([]string{"b", "a", "b", "c", "d"}, series.String, "COL.1"), + series.New([]float64{3.0, 4.0, 5.3, 3.2, 1.2}, series.Float, "COL.3"), + ), + New( + series.New([]string{"b", "a", "b", "c", "d", "b", "a", "b", "c", "d"}, series.String, "COL.1"), + series.New([]int{1, 2, 4, 5, 4}, series.Int, "COL.2").Concat(series.New([]NA{NA{}, NA{}, NA{}, NA{}, NA{}}, series.Int, "")), + series.New([]float64{3.0, 4.0, 5.3, 3.2, 1.2, 3.0, 4.0, 5.3, 3.2, 1.2}, series.Float, "COL.3"), + ), + }, + { + a, + New( + series.New([]int{1, 2, 4, 5, 4}, series.Int, "COL.1"), + series.New([]int{1, 2, 4, 5, 4}, series.Int, "COL.2"), + series.New([]float64{3.0, 4.0, 5.3, 3.2, 1.2}, series.Float, "COL.3"), + series.New([]string{"a", "b", "c", "d", "e"}, series.String, "COL.4"), + ), + New( + series.New([]string{"b", "a", "b", "c", "d", "1", "2", "4", "5", "4"}, series.String, "COL.1"), + series.New([]int{1, 2, 4, 5, 4, 1, 2, 4, 5, 4}, series.Int, "COL.2"), + series.New([]float64{3.0, 4.0, 5.3, 3.2, 1.2, 3.0, 4.0, 5.3, 3.2, 1.2}, series.Float, "COL.3"), + series.New([]NA{NA{}, NA{}, NA{}, NA{}, NA{}}, series.String, "COL.4").Concat(series.New([]string{"a", "b", "c", "d", "e"}, series.String, "COL.4")), + ), + }, + { + a, + New( + series.New([]string{"a", "b", "c", "d", "e"}, series.String, "COL.0"), + series.New([]int{1, 2, 4, 5, 4}, series.Int, "COL.1"), + series.New([]int{1, 2, 4, 5, 4}, series.Int, "COL.2"), + 
series.New([]float64{3.0, 4.0, 5.3, 3.2, 1.2}, series.Float, "COL.3"), + ), + New( + series.New([]string{"b", "a", "b", "c", "d", "1", "2", "4", "5", "4"}, series.String, "COL.1"), + series.New([]int{1, 2, 4, 5, 4, 1, 2, 4, 5, 4}, series.Int, "COL.2"), + series.New([]float64{3.0, 4.0, 5.3, 3.2, 1.2, 3.0, 4.0, 5.3, 3.2, 1.2}, series.Float, "COL.3"), + series.New([]NA{NA{}, NA{}, NA{}, NA{}, NA{}}, series.String, "COL.0").Concat(series.New([]string{"a", "b", "c", "d", "e"}, series.String, "COL.0")), + ), + }, + { + DataFrame{}, + a, + a, + }, + } + for i, tc := range table { + b := tc.dfa.Concat(tc.dfb) + + if b.Err != nil { + t.Errorf("Test: %d\nError:%v", i, b.Err) + } + //if err := checkAddrDf(a, b); err != nil { + //t.Error(err) + //} + // Check that the types are the same between both DataFrames + if !reflect.DeepEqual(tc.expDf.Types(), b.Types()) { + t.Errorf("Test: %d\nDifferent types:\nA:%v\nB:%v", i, tc.expDf.Types(), b.Types()) + } + // Check that the colnames are the same between both DataFrames + if !reflect.DeepEqual(tc.expDf.Names(), b.Names()) { + t.Errorf("Test: %d\nDifferent colnames:\nA:%v\nB:%v", i, tc.expDf.Names(), b.Names()) + } + // Check that the values are the same between both DataFrames + if !reflect.DeepEqual(tc.expDf.Records(), b.Records()) { + t.Errorf("Test: %d\nDifferent values:\nA:%v\nB:%v", i, tc.expDf.Records(), b.Records()) + } + } +} func TestDataFrame_Records(t *testing.T) { a := New( series.New([]string{"a", "b", "c"}, series.String, "COL.1"), @@ -634,7 +745,7 @@ func TestDataFrame_Mutate(t *testing.T) { } } -func TestDataFrame_Filter(t *testing.T) { +func TestDataFrame_Filter_Or(t *testing.T) { a := New( series.New([]string{"b", "a", "b", "c", "d"}, series.String, "COL.1"), series.New([]int{1, 2, 4, 5, 4}, series.Int, "COL.2"), @@ -645,7 +756,7 @@ func TestDataFrame_Filter(t *testing.T) { expDf DataFrame }{ { - []F{{"COL.2", series.GreaterEq, 4}}, + []F{{0, "COL.2", series.GreaterEq, 4}}, New( series.New([]string{"b", "c", 
"d"}, series.String, "COL.1"), series.New([]int{4, 5, 4}, series.Int, "COL.2"), @@ -654,8 +765,8 @@ func TestDataFrame_Filter(t *testing.T) { }, { []F{ - {"COL.2", series.Greater, 4}, - {"COL.2", series.Eq, 1}, + {0, "COL.2", series.Greater, 4}, + {0, "COL.2", series.Eq, 1}, }, New( series.New([]string{"b", "c"}, series.String, "COL.1"), @@ -665,9 +776,21 @@ func TestDataFrame_Filter(t *testing.T) { }, { []F{ - {"COL.2", series.Greater, 4}, - {"COL.2", series.Eq, 1}, - {"COL.1", series.Eq, "d"}, + {0, "COL.2", series.Greater, 4}, + {0, "COL.2", series.Eq, 1}, + {0, "COL.1", series.Eq, "d"}, + }, + New( + series.New([]string{"b", "c", "d"}, series.String, "COL.1"), + series.New([]int{1, 5, 4}, series.Int, "COL.2"), + series.New([]float64{3.0, 3.2, 1.2}, series.Float, "COL.3"), + ), + }, + { + []F{ + {1, "", series.Greater, 4}, + {1, "", series.Eq, 1}, + {0, "", series.Eq, "d"}, }, New( series.New([]string{"b", "c", "d"}, series.String, "COL.1"), @@ -697,6 +820,87 @@ func TestDataFrame_Filter(t *testing.T) { if !reflect.DeepEqual(tc.expDf.Records(), b.Records()) { t.Errorf("Test: %d\nDifferent values:\nA:%v\nB:%v", i, tc.expDf.Records(), b.Records()) } + + b2 := a.FilterAggregation(Or, tc.filters...) 
+ + // Check that the types are the same between both DataFrames + if !reflect.DeepEqual(b.Types(), b2.Types()) { + t.Errorf("Test: %d\nDifferent types:\nB:%v\nB2:%v", i, b.Types(), b2.Types()) + } + // Check that the colnames are the same between both DataFrames + if !reflect.DeepEqual(b.Names(), b2.Names()) { + t.Errorf("Test: %d\nDifferent colnames:\nB:%v\nB2:%v", i, b.Names(), b2.Names()) + } + // Check that the values are the same between both DataFrames + if !reflect.DeepEqual(b.Records(), b2.Records()) { + t.Errorf("Test: %d\nDifferent values:\nB:%v\nB2:%v", i, b.Records(), b2.Records()) + } + } +} + +func TestDataFrame_Filter_And(t *testing.T) { + a := New( + series.New([]string{"b", "a", "b", "c", "d"}, series.String, "COL.1"), + series.New([]int{1, 2, 4, 5, 4}, series.Int, "COL.2"), + series.New([]float64{3.0, 4.0, 5.3, 3.2, 1.2}, series.Float, "COL.3"), + ) + table := []struct { + filters []F + expDf DataFrame + }{ + { + []F{{Colname: "COL.2", Comparator: series.GreaterEq, Comparando: 4}}, + New( + series.New([]string{"b", "c", "d"}, series.String, "COL.1"), + series.New([]int{4, 5, 4}, series.Int, "COL.2"), + series.New([]float64{5.3, 3.2, 1.2}, series.Float, "COL.3"), + ), + }, + // should not have any rows + { + []F{ + {Colname: "COL.2", Comparator: series.Greater, Comparando: 4}, + {Colname: "COL.2", Comparator: series.Eq, Comparando: 1}, + }, + New( + series.New([]string{}, series.String, "COL.1"), + series.New([]int{}, series.Int, "COL.2"), + series.New([]float64{}, series.Float, "COL.3"), + ), + }, + { + []F{ + {Colname: "COL.2", Comparator: series.Less, Comparando: 4}, + {Colname: "COL.1", Comparator: series.Eq, Comparando: "b"}, + }, + New( + series.New([]string{"b"}, series.String, "COL.1"), + series.New([]int{1}, series.Int, "COL.2"), + series.New([]float64{3.0}, series.Float, "COL.3"), + ), + }, + } + for i, tc := range table { + b := a.FilterAggregation(And, tc.filters...) 
+ + if b.Err != nil { + t.Errorf("Test: %d\nError:%v", i, b.Err) + } + //if err := checkAddrDf(a, b); err != nil { + //t.Error(err) + //} + // Check that the types are the same between both DataFrames + if !reflect.DeepEqual(tc.expDf.Types(), b.Types()) { + t.Errorf("Test: %d\nDifferent types:\nA:%v\nB:%v", i, tc.expDf.Types(), b.Types()) + } + // Check that the colnames are the same between both DataFrames + if !reflect.DeepEqual(tc.expDf.Names(), b.Names()) { + t.Errorf("Test: %d\nDifferent colnames:\nA:%v\nB:%v", i, tc.expDf.Names(), b.Names()) + } + // Check that the values are the same between both DataFrames + if !reflect.DeepEqual(tc.expDf.Records(), b.Records()) { + t.Errorf("Test: %d\nDifferent values:\nA:%v\nB:%v", i, tc.expDf.Records(), b.Records()) + } } } @@ -1191,13 +1395,13 @@ func TestReadJSON(t *testing.T) { expDf DataFrame }{ { - `[{"COL.1":null,"COL.2":1,"COL.3":3},{"COL.1":5,"COL.2":2,"COL.3":2},{"COL.1":6,"COL.2":3,"COL.3":1}]`, + `[{"COL.1":null,"COL.2":1,"COL.3":3},{"COL.1":5,"COL.2":2,"COL.3":2},{"COL.1":6,"COL.2":3,"COL.3":20180428}]`, LoadRecords( [][]string{ {"COL.1", "COL.2", "COL.3"}, {"NaN", "1", "3"}, {"5", "2", "2"}, - {"6", "3", "1"}, + {"6", "3", "20180428"}, }, DetectTypes(false), DefaultType(series.Int), @@ -1238,6 +1442,79 @@ func TestReadJSON(t *testing.T) { } } +func TestReadHTML(t *testing.T) { + table := []struct { + htmlStr string + expDf []DataFrame + }{ + { + "", + []DataFrame{}, + }, + { + ` + + + + +
COL.1
100
+ + `, + []DataFrame{ + LoadRecords( + [][]string{ + {"COL.1"}, + {"100"}, + }), + }, + }, + { + ` + + + + +
COL.1COL.2COL.3
100
+ + `, + []DataFrame{ + LoadRecords( + [][]string{ + {"COL.1", "COL.2", "COL.3"}, + {"COL.1", "COL.2", "100"}, + }), + }, + }, + } + + for i, tc := range table { + cs := ReadHTML(strings.NewReader(tc.htmlStr)) + if tc.htmlStr != "" && len(cs) == 0 { + t.Errorf("Test: %d, got zero dataframes: %#v", i, cs) + } + for j, c := range cs { + if len(cs) != len(tc.expDf) { + t.Errorf("Test: %d\n got len(%d), want len(%d)", i, len(cs), len(tc.expDf)) + } + if c.Err != nil { + t.Errorf("Test: %d\nError:%v", i, c.Err) + } + // Check that the types are the same between both DataFrames + if !reflect.DeepEqual(tc.expDf[j].Types(), c.Types()) { + t.Errorf("Test: %d\nDifferent types:\nA:%v\nB:%v", i, tc.expDf[j].Types(), c.Types()) + } + // Check that the colnames are the same between both DataFrames + if !reflect.DeepEqual(tc.expDf[j].Names(), c.Names()) { + t.Errorf("Test: %d\nDifferent colnames:\nA:%v\nB:%v", i, tc.expDf[j].Names(), c.Names()) + } + // Check that the values are the same between both DataFrames + if !reflect.DeepEqual(tc.expDf[j].Records(), c.Records()) { + t.Errorf("Test: %d\nDifferent values:\nA:%v\nB:%v", i, tc.expDf[j].Records(), c.Records()) + } + } + } +} + func TestDataFrame_SetNames(t *testing.T) { a := New( series.New([]string{"a", "b", "c"}, series.String, "COL.1"), diff --git a/go.mod b/go.mod new file mode 100644 index 0000000..4fa2d10 --- /dev/null +++ b/go.mod @@ -0,0 +1,8 @@ +module github.com/go-gota/gota + +go 1.12 + +require ( + golang.org/x/net v0.0.0-20200114155413-6afb5195e5aa + gonum.org/v1/gonum v0.0.0-20190926113837-94b2bbd8ac13 +) diff --git a/go.sum b/go.sum new file mode 100644 index 0000000..6e994a0 --- /dev/null +++ b/go.sum @@ -0,0 +1,23 @@ +github.com/ajstarks/svgo v0.0.0-20180226025133-644b8db467af/go.mod h1:K08gAheRH3/J6wwsYMMT4xOr94bZjxIelGM0+d/wbFw= +github.com/fogleman/gg v1.2.1-0.20190220221249-0403632d5b90/go.mod h1:R/bRT+9gY/C5z7JzPU0zXsXHKM4/ayA+zqcVNZzPa1k= +github.com/golang/freetype 
v0.0.0-20170609003504-e2365dfdc4a0/go.mod h1:E/TSTwGwJL78qG/PmXZO1EjYhfJinVAhrmmHX6Z8B9k= +github.com/jung-kurt/gofpdf v1.0.3-0.20190309125859-24315acbbda5/go.mod h1:7Id9E/uU8ce6rXgefFLlgrJj/GYY22cpxn+r32jIOes= +golang.org/x/crypto v0.0.0-20190308221718-c2843e01d9a2/go.mod h1:djNgcEr1/C05ACkg1iLfiJU5Ep61QUkGW8qpdssI0+w= +golang.org/x/exp v0.0.0-20180321215751-8460e604b9de/go.mod h1:CJ0aWSM057203Lf6IL+f9T1iT9GByDxfZKAQTCR3kQA= +golang.org/x/exp v0.0.0-20180807140117-3d87b88a115f/go.mod h1:CJ0aWSM057203Lf6IL+f9T1iT9GByDxfZKAQTCR3kQA= +golang.org/x/exp v0.0.0-20190125153040-c74c464bbbf2 h1:y102fOLFqhV41b+4GPiJoa0k/x+pJcEi2/HB1Y5T6fU= +golang.org/x/exp v0.0.0-20190125153040-c74c464bbbf2/go.mod h1:CJ0aWSM057203Lf6IL+f9T1iT9GByDxfZKAQTCR3kQA= +golang.org/x/image v0.0.0-20180708004352-c73c2afc3b81/go.mod h1:ux5Hcp/YLpHSI86hEcLt0YII63i6oz57MZXIpbrjZUs= +golang.org/x/net v0.0.0-20200114155413-6afb5195e5aa h1:F+8P+gmewFQYRk6JoLQLwjBCTu3mcIURZfNkVweuRKA= +golang.org/x/net v0.0.0-20200114155413-6afb5195e5aa/go.mod h1:z5CRVTTTmAJ677TzLLGU+0bjPO0LkuOLi4/5GtJWs/s= +golang.org/x/sys v0.0.0-20190215142949-d0b11bdaac8a/go.mod h1:STP8DvDyc/dI5b8T5hshtkjS+E42TnysNCUPdjciGhY= +golang.org/x/text v0.3.0/go.mod h1:NqM8EUOU14njkJ3fqMW+pc6Ldnwhi/IjpwHt7yyuwOQ= +golang.org/x/tools v0.0.0-20180525024113-a5b4c53f6e8b/go.mod h1:n7NCudcB/nEzxVGmLbDWY5pfWTLqBcC2KZ6jyYvM4mQ= +golang.org/x/tools v0.0.0-20190206041539-40960b6deb8e/go.mod h1:n7NCudcB/nEzxVGmLbDWY5pfWTLqBcC2KZ6jyYvM4mQ= +gonum.org/v1/gonum v0.0.0-20180816165407-929014505bf4/go.mod h1:Y+Yx5eoAFn32cQvJDxZx5Dpnq+c3wtXuadVZAcxbbBo= +gonum.org/v1/gonum v0.0.0-20190926113837-94b2bbd8ac13 h1:+wY2+nL3JJvpg6nJcpCNtA5h21osquBlKTl8g0hIlps= +gonum.org/v1/gonum v0.0.0-20190926113837-94b2bbd8ac13/go.mod h1:9mxDZsDKxgMAuccQkewq682L+0eCu4dCN2yonUJTCLU= +gonum.org/v1/netlib v0.0.0-20190313105609-8cb42192e0e0 h1:OE9mWmgKkjJyEmDAAtGMPjXu+YNeGvK9VTSHY6+Qihc= +gonum.org/v1/netlib v0.0.0-20190313105609-8cb42192e0e0/go.mod 
h1:wa6Ws7BG/ESfp6dHfk7C6KdzKA7wR7u/rKwOGE66zvw= +gonum.org/v1/plot v0.0.0-20190515093506-e2840ee46a6b/go.mod h1:Wt8AAjI+ypCyYX3nZBvf6cAIx93T+c/OS2HFAYskSZc= +rsc.io/pdf v0.1.1/go.mod h1:n8OzWcQ6Sp37PL01nO98y4iUCRdTGarVfzxY20ICaU4= diff --git a/series/series.go b/series/series.go index 2f03baa..1e444b9 100644 --- a/series/series.go +++ b/series/series.go @@ -93,15 +93,19 @@ type Comparator string // Supported Comparators const ( - Eq Comparator = "==" // Equal - Neq Comparator = "!=" // Non equal - Greater Comparator = ">" // Greater than - GreaterEq Comparator = ">=" // Greater or equal than - Less Comparator = "<" // Lesser than - LessEq Comparator = "<=" // Lesser or equal than - In Comparator = "in" // Inside + Eq Comparator = "==" // Equal + Neq Comparator = "!=" // Non equal + Greater Comparator = ">" // Greater than + GreaterEq Comparator = ">=" // Greater or equal than + Less Comparator = "<" // Lesser than + LessEq Comparator = "<=" // Lesser or equal than + In Comparator = "in" // Inside + CompFunc Comparator = "func" // user-defined comparison function ) +// compFunc defines a user-defined comparator function. Used internally for type assertions +type compFunc = func(el Element) bool + // Type is a convenience alias that can be used for a more type safe way of // reason and use Series types. 
type Type string @@ -425,9 +429,25 @@ func (s Series) Compare(comparator Comparator, comparando interface{}) Series { return ret, nil } - comp := New(comparando, s.t, "") bools := make([]bool, s.Len()) - // In comparator comparation + + // CompFunc comparator comparison + if comparator == CompFunc { + f, ok := comparando.(compFunc) + if !ok { + panic("comparando is not a comparison function of type func(el Element) bool") + } + + for i := 0; i < s.Len(); i++ { + e := s.elements.Elem(i) + bools[i] = f(e) + } + + return Bools(bools) + } + + comp := New(comparando, s.t, "") + // In comparator comparison if comparator == In { for i := 0; i < s.Len(); i++ { e := s.elements.Elem(i) @@ -812,7 +832,6 @@ func (s Series) Quantile(p float64) float64 { // the function passed in via argument `f` will not expect another type, but // instead expects to handle Element(s) of type Float. func (s Series) Map(f MapFunction) Series { - mappedValues := make([]Element, s.Len()) for i := 0; i < s.Len(); i++ { value := f(s.elements.Elem(i)) diff --git a/series/series_test.go b/series/series_test.go index 862b7ee..588144b 100644 --- a/series/series_test.go +++ b/series/series_test.go @@ -4,8 +4,8 @@ import ( "fmt" "math" "reflect" - "testing" "strings" + "testing" ) // Check that there are no shared memory addreses between the elements of two Series @@ -421,6 +421,76 @@ func TestSeries_Compare(t *testing.T) { } } +func TestSeries_Compare_CompFunc(t *testing.T) { + table := []struct { + series Series + comparator Comparator + comparando interface{} + expected Series + panic bool + }{ + { + Strings([]string{"A", "B", "C", "B", "D", "BADA"}), + CompFunc, + func(el Element) bool { + if el.Type() == String { + if val, ok := el.Val().(string); ok { + return strings.HasPrefix(val, "B") + } + return false + } + return false + }, + Bools([]bool{false, true, false, true, false, true}), + false, + }, + { + Strings([]string{"A", "B", "C", "B", "D", "BADA"}), + CompFunc, + func(el Element) {}, + 
Bools([]bool{false, false, false, false, false}), + true, + }, + } + for testnum, test := range table { + func() { + defer func() { + if r := recover(); r != nil { + // recovered + if !test.panic { + t.Errorf("did not expected panic but was '%v'", r) + } + } else { + // nothing to recover from + if test.panic { + t.Errorf("exptected panic but did not panic") + } + } + }() + + a := test.series + b := a.Compare(test.comparator, test.comparando) + if err := b.Err; err != nil { + t.Errorf("Test:%v\nError:%v", testnum, err) + } + expected := test.expected.Records() + received := b.Records() + if !reflect.DeepEqual(expected, received) { + t.Errorf( + "Test:%v\nExpected:\n%v\nReceived:\n%v", + testnum, expected, received, + ) + } + if err := checkTypes(b); err != nil { + t.Errorf( + "Test:%v\nError:%v", + testnum, err, + ) + } + }() + } +} + func TestSeries_Subset(t *testing.T) { table := []struct { series Series @@ -1525,9 +1595,8 @@ func TestSeries_Quantile(t *testing.T) { } } - func TestSeries_Map(t *testing.T) { - tests := []struct { + tests := []struct { series Series expected Series }{ @@ -1564,11 +1633,11 @@ func TestSeries_Map(t *testing.T) { doubleFloat64 := func(e Element) Element { var result Element result = e.Copy() - result.Set(result.Float() * 2) + result.Set(result.Float() * 2) return Element(result) } - // and two booleans + // and two booleans and := func(e Element) Element { var result Element result = e.Copy() @@ -1588,11 +1657,11 @@ func TestSeries_Map(t *testing.T) { i, err := result.Int() if err != nil { return Element(&intElement{ - e: +5, + e: +5, nan: false, }) } - result.Set(i + 5) + result.Set(i + 5) return Element(result) } @@ -1604,12 +1673,12 @@ func TestSeries_Map(t *testing.T) { return Element(result) } - for testnum, test := range tests { + for testnum, test := range tests { switch test.series.Type() { case Bool: expected := test.expected received := test.series.Map(and) - for i := 0 ; i