
Add spEnrichedFilter #60

Status: Closed (wants to merge 20 commits)

Commits
ba71200  Add spEnrichedFilter (colmsnowplow, Jun 16, 2021)
fc495fd  Address type coercion in valueFound and add test case for it (colmsnowplow, Jun 23, 2021)
d9cd827  Remove unnecessary note (colmsnowplow, Jun 23, 2021)
cd5e8a4  Move initial evaluation of filterConfig outside returned function and… (colmsnowplow, Jun 28, 2021)
39c4c8c  Add missing copyright notice (colmsnowplow, Jun 28, 2021)
d09af8a  Move interpretation of intermediateState into utility function (colmsnowplow, Jun 28, 2021)
1469b99  Use utility function to handle intermediateState for legacy implement… (colmsnowplow, Jun 28, 2021)
5a5002a  Refactor filtering (colmsnowplow, Jul 7, 2021)
03793bf  Cleanup (colmsnowplow, Jul 7, 2021)
c5f7064  Break transformation loop if message is filtered (colmsnowplow, Jul 8, 2021)
0e019f7  Update observer model to accommodate filtered messages (colmsnowplow, Jul 8, 2021)
e98d953  Fix bug which fails messages where the requested field is empty (colmsnowplow, Jul 8, 2021)
8428a31  Cleanup (colmsnowplow, Jul 8, 2021)
3b10912  Improve readability of filtering logic (colmsnowplow, Jul 13, 2021)
2d46288  Improve invalid filter config error message (colmsnowplow, Jul 13, 2021)
79ebc62  Fix doc for observer Filtered function (colmsnowplow, Jul 13, 2021)
09a8e0d  Patch broken config test (colmsnowplow, Jul 13, 2021)
9e49dfd  Rename intermediateAsParsed to intermediateAsSpEnrichedParsed (colmsnowplow, Jul 13, 2021)
83f36db  Fix lint issues (colmsnowplow, Jul 13, 2021)
1c40689  Remove unnecessary else block in intermediateAsSpEnrichedParsed (colmsnowplow, Jul 13, 2021)
4 changes: 3 additions & 1 deletion cmd/config.go
@@ -389,9 +389,11 @@ func (c *Config) GetTransformations() (transform.TransformationApplyFunction, er
funcs = append(funcs, transform.SpEnrichedToJson)
case "spEnrichedSetPk":
funcs = append(funcs, transform.NewSpEnrichedSetPkFunction(funcOpts[1]))
+ case "spEnrichedFilter":
+ funcs = append(funcs, transform.NewSpEnrichedFilterFunction(funcOpts[1]))
case "none":
default:
- return nil, errors.New(fmt.Sprintf("Invalid transformation found; expected one of 'spEnrichedToJson', 'spEnrichedSetPk:{option}' and got '%s'", c.Transformation))
+ return nil, errors.New(fmt.Sprintf("Invalid transformation found; expected one of 'spEnrichedToJson', 'spEnrichedSetPk:{option}', spEnrichedFilter:{option} and got '%s'", c.Transformation))
}
}
return transform.NewTransformation(funcs...), nil
2 changes: 1 addition & 1 deletion cmd/config_test.go
@@ -109,7 +109,7 @@ func TestNewConfig_InvalidTransformation(t *testing.T) {
transformation, err := c.GetTransformations()
assert.Nil(transformation)
assert.NotNil(err)
- assert.Equal("Invalid transformation found; expected one of 'spEnrichedToJson', 'spEnrichedSetPk:{option}' and got 'fake'", err.Error())
+ assert.Equal("Invalid transformation found; expected one of 'spEnrichedToJson', 'spEnrichedSetPk:{option}', spEnrichedFilter:{option} and got 'fake'", err.Error())
}

func TestNewConfig_InvalidTarget(t *testing.T) {
80 changes: 80 additions & 0 deletions pkg/transform/snowplow_enriched_filter.go
@@ -0,0 +1,80 @@
// PROPRIETARY AND CONFIDENTIAL
//
// Unauthorized copying of this file via any medium is strictly prohibited.
//
// Copyright (c) 2020-2021 Snowplow Analytics Ltd. All rights reserved.

package transform

import (
"fmt"
"strings"

"github.com/snowplow-devops/stream-replicator/pkg/models"
"github.com/snowplow/snowplow-golang-analytics-sdk/analytics"
)

// NewSpEnrichedFilterFunction returns a TransformationFunction which filters messages based on a field in the Snowplow enriched event.
// The filterConfig argument describes the conditions for including a message.
// For example "aid==abc|def" includes all events with app IDs of abc or def, and filters out the rest.
// "aid!=abc|def" includes all events whose app IDs do not match abc or def, and filters out the rest.
func NewSpEnrichedFilterFunction(filterConfig string) TransformationFunction {
return func(message *models.Message, intermediateState interface{}) (*models.Message, *models.Message, interface{}) {

// Check for a negation condition first
keyValues := strings.SplitN(filterConfig, "!=", 2)
Contributor:

When I first read the PR description, I thought it was multiple statements split with |, but now I realise it's a single field with multiple options. I think it might be worth adding multiple field filters rather than multiple values per field, as that offers more flexibility without too much additional complexity (a little more, though).

e.g. rather than MESSAGE_TRANSFORMATION=spEnrichedFilter:{field}=={value1}|{value2}|... we'd have MESSAGE_TRANSFORMATION=spEnrichedFilter:{field}=={value1}|{field}=={value2}|{field2}=={value25}

Collaborator Author:

So following our chat - some clarification. An individual filter can only deal with one field, but can match n values. So an individual filter can either:

  • Include the event if the value is one of a set of values,
    OR
  • Exclude the event if the value is one of a set of values

However, we can apply as many filters as we like in a row - essentially chaining the above one after another. So your above example would be configured like so:

MESSAGE_TRANSFORMATION=spEnrichedFilter:{field}=={value1}|{value2},spEnrichedFilter:{field2}=={value25}

So if we have conditions which depend on multiple fields, we can do it as long as we require both conditions to be met. If we require that one or the other condition is met, we currently cannot configure that. So, if the example requirement was:

[field == value1|value2] OR [field2 == value25]

Then we'd be out of luck.
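The chaining semantics described above (comma-separated filters applied one after another, all of which must keep the event) can be sketched with a self-contained toy model. The message type and helpers below are hypothetical stand-ins for illustration, not the stream-replicator API:

```go
package main

import (
	"fmt"
	"strings"
)

// message is a simplified stand-in for models.Message.
type message struct {
	appID  string
	field2 string
}

// filter reports whether a message should be kept.
type filter func(m message) bool

// fieldIn keeps messages whose field value is one of the allowed values,
// mirroring spEnrichedFilter:{field}=={value1}|{value2}.
func fieldIn(get func(message) string, values string) filter {
	allowed := strings.Split(values, "|")
	return func(m message) bool {
		for _, v := range allowed {
			if get(m) == v {
				return true
			}
		}
		return false
	}
}

// chain applies filters one after another: every filter must keep the
// message (AND semantics), matching comma-separated transformations.
func chain(filters ...filter) filter {
	return func(m message) bool {
		for _, f := range filters {
			if !f(m) {
				return false
			}
		}
		return true
	}
}

func main() {
	combined := chain(
		fieldIn(func(m message) string { return m.appID }, "value1|value2"),
		fieldIn(func(m message) string { return m.field2 }, "value25"),
	)
	fmt.Println(combined(message{appID: "value1", field2: "value25"})) // true
	fmt.Println(combined(message{appID: "value1", field2: "other"}))   // false
}
```

This also makes the limitation concrete: chaining gives only AND across fields, so an OR condition across two fields cannot be expressed.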


var keepMessage bool
if len(keyValues) > 1 {
// If negation condition is found, default to keep the message, and change this when match found
keepMessage = true
} else {
// Otherwise, look for affirmation condition, default to drop the message and change when match found
keyValues = strings.SplitN(filterConfig, "==", 2)
keepMessage = false
}
// TODO: Design - Should there be validation of the input here, or perhaps in the config? Or at all?
Contributor:

Yes, we should definitely make sure the input is valid. It'd be pretty easy to make a mistake when configuring this. Even if it's just ensuring all the characters are valid and it's been parsed reasonably; if not, responding with an example.

Collaborator Author:

Agreed - needs testing but as we discussed moving this section of code out of the returned function and into the body of NewSpEnrichedFilterFunction should achieve what we need.
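The change agreed here (evaluate filterConfig once in the constructor body, so an invalid config can be rejected up front rather than on every message) might look roughly like the sketch below. parseFilterConfig and its error text are illustrative assumptions, not the PR's final code:

```go
package main

import (
	"fmt"
	"strings"
)

// parseFilterConfig splits a config like "app_id==a|b" or "app_id!=a|b"
// into field, candidate values, and a negation flag, validating it up front.
func parseFilterConfig(cfg string) (field string, values []string, negate bool, err error) {
	// Check for a negation condition first, as in the returned-function version.
	if parts := strings.SplitN(cfg, "!=", 2); len(parts) == 2 {
		return parts[0], strings.Split(parts[1], "|"), true, nil
	}
	if parts := strings.SplitN(cfg, "==", 2); len(parts) == 2 {
		return parts[0], strings.Split(parts[1], "|"), false, nil
	}
	return "", nil, false, fmt.Errorf("invalid filter config %q; expected {field}=={values} or {field}!={values}", cfg)
}

func main() {
	field, values, negate, err := parseFilterConfig("app_id==test-data3|other")
	fmt.Println(field, values, negate, err) // app_id [test-data3 other] false <nil>

	_, _, _, err = parseFilterConfig("garbage")
	fmt.Println(err != nil) // true
}
```

A constructor doing this parse before returning the closure could surface the error at startup, when the config is first read.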


// Todo: make this its own function and DRY across all the transformations?
parsedMessage, ok := intermediateState.(analytics.ParsedEvent)
var parseErr error
if !ok {
parsedMessage, parseErr = analytics.ParseEvent(string(message.Data))
if parseErr != nil {
message.SetError(parseErr)
return nil, message, nil
}
intermediateState = parsedMessage
}

valueFound, err := parsedMessage.GetValue(keyValues[0])
if err != nil {
message.SetError(err)
return nil, message, nil
}

evaluation:
for _, valueToMatch := range strings.Split(keyValues[1], "|") {
if valueToMatch == fmt.Sprintf("%v", valueFound) { // coerce to string as valueFound may be any type found in a Snowplow event
keepMessage = !keepMessage
Contributor:

nit:

if isNegationFilter {
  shouldKeepMessage = false
} else {
  shouldKeepMessage = true
} 

For me this makes it clearer what this does, rather than flipping a boolean. I think this is worth the extra check for the sake of readability, especially since it will only be called once per filter match. If you don't want the if, then shouldKeepMessage = !isNegationFilter is clearer than flipping the same boolean.

Collaborator Author:

I prefer the brevity of shouldKeepMessage = !isNegationFilter, but I think you're right that the if statement is much better for readability so going with that. :)

break evaluation
// Once config value is matched once, change keepMessage then break out of the loop to avoid reverting back when we have two matches
}
}

// If message is not to be kept, ack it and return a nil result.
if !keepMessage {
if message.AckFunc != nil {
message.AckFunc()
}
return nil, nil, nil
Contributor:

This is quite clear, marking it as neither a success nor a failure. It seems a little odd that a "transformation" can ack a message though; it feels like a bit of an unexpected side effect of a transformation.
I wonder if we can take the same logic out of the transformation section and pull it into a filter section in the code base, to keep the two concerns separated? This might also afford us some future flexibility in filtering, without having to squeeze it into the same shape as a transformation.

Collaborator Author:

Yeah, I think I agree, but I'd consider it out of scope of what we're doing here, which is to implement whatever sensible filtering we can in the short term.

I think it makes sense to separate the concept of a filter from the concept of a transformation. The only time I can really see this becoming a problem is if a filter were to depend on a transformation... but the current implementation doesn't allow for this anyway. So certainly food for thought, and a more nuanced design is well worth thinking over.

Member:

As a stop-gap, how do you feel about extending the interface to return a filtered message list, so that we can handle what to do with the filtered set outside of the transformations?

Collaborator Author:

Worth considering... I did knock the idea about in my head, but ultimately figured that if we're gonna just ack them and ignore them, it's more efficient to do it straight away rather than pass more data around.

Having said that I'm not opposed to doing it that way for this instrumentation!

Collaborator Author:

On further reflection, I think I'm actually opposed to doing it this way, for the simple reason that it involves a lot of changes in a lot of places that we'd presumably want to revert should we redesign it out:

  • Change the model for transformations
  • Change each transformation to suit the new model
  • Change the transformationApplyFunction to handle slices of filtered data
  • Change the main function code in cmd/serverless and cmd/cli to ack the filtered data

Ultimately, it's not that these aren't worth doing, but that's as much work or more than just implementing Paul's suggestion that filters are done separately from transformations - and if we're to decide that's the best design, we'd need to undo all of the above in order to re-implement against that design.

Thoughts @jbeemster ?

Collaborator Author (Jul 5, 2021):

Just popping an update here for clarity since I'm working on another branch atm, and we've progressed the discussion elsewhere.

We have discussed this, and leaving things as they are isn't really an option, so we'll need to refactor something. At the moment I'm leaning towards Josh's suggestion of modifying the transformationResult model to allow a 'filtered' slice of messages, which are subsequently acked outside of the function.

Separating filters out completely means that we can't share information across a filter and a transformation, but filters can be much more powerful if they're allowed to depend on transformations IMO.
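The direction settled on above (and reflected in the later "Update observer model to accommodate filtered messages" commit) is to track filtered messages in the result model and ack them outside the transformation. A rough, hypothetical sketch of that shape; the real TransformationResult fields and names may differ:

```go
package main

import "fmt"

// Message is a simplified stand-in for models.Message.
type Message struct {
	Data         []byte
	PartitionKey string
}

// TransformationResult sketches a result model that tracks filtered
// messages separately from successes and invalid messages.
type TransformationResult struct {
	Result   []*Message
	Invalid  []*Message
	Filtered []*Message
}

// splitByFilter partitions messages using a keep predicate; the caller
// can then ack res.Filtered outside the transformation itself.
func splitByFilter(msgs []*Message, keep func(*Message) bool) TransformationResult {
	var res TransformationResult
	for _, m := range msgs {
		if keep(m) {
			res.Result = append(res.Result, m)
		} else {
			res.Filtered = append(res.Filtered, m)
		}
	}
	return res
}

func main() {
	msgs := []*Message{
		{Data: []byte("keep-me")},
		{Data: []byte("drop-me")},
	}
	res := splitByFilter(msgs, func(m *Message) bool { return string(m.Data) == "keep-me" })
	fmt.Println(len(res.Result), len(res.Filtered)) // 1 1
}
```

This keeps the transformation free of acking side effects while still letting filters share the parsed intermediateState with transformations.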

}

// Otherwise, return the message and intermediateState for further processing.
return message, nil, intermediateState
}
}
155 changes: 155 additions & 0 deletions pkg/transform/snowplow_enriched_filter_test.go
@@ -0,0 +1,155 @@
package transform

import (
"testing"

"github.com/snowplow-devops/stream-replicator/pkg/models"
"github.com/stretchr/testify/assert"
)

func TestNewSpEnrichedFilterFunction(t *testing.T) {
assert := assert.New(t)

var messageGood = models.Message{
Data: snowplowTsv3,
PartitionKey: "some-key",
}

// Single value cases
aidFilterFuncKeep := NewSpEnrichedFilterFunction("app_id==test-data3")

aidFilteredIn, fail, _ := aidFilterFuncKeep(&messageGood, nil)

assert.Equal(snowplowTsv3, aidFilteredIn.Data)
assert.Nil(fail)

aidFilterFuncDiscard := NewSpEnrichedFilterFunction("app_id==failThis")

aidFilteredOut, fail2, _ := aidFilterFuncDiscard(&messageGood, nil)

assert.Nil(aidFilteredOut)
assert.Nil(fail2)

// int value
urlPrtFilterFuncKeep := NewSpEnrichedFilterFunction("page_urlport==80")

urlPrtFilteredIn, fail, _ := urlPrtFilterFuncKeep(&messageGood, nil)

assert.Equal(snowplowTsv3, urlPrtFilteredIn.Data)
assert.Nil(fail)

// Multiple value cases
aidFilterFuncKeepWithMultiple := NewSpEnrichedFilterFunction("app_id==someotherValue|test-data3")

aidFilteredKeptWithMultiple, fail3, _ := aidFilterFuncKeepWithMultiple(&messageGood, nil)

assert.Equal(snowplowTsv3, aidFilteredKeptWithMultiple.Data)
assert.Nil(fail3)

aidFilterFuncDiscardWithMultiple := NewSpEnrichedFilterFunction("app_id==someotherValue|failThis")

aidFilteredDiscardedWithMultiple, fail3, _ := aidFilterFuncDiscardWithMultiple(&messageGood, nil)

assert.Nil(aidFilteredDiscardedWithMultiple)
assert.Nil(fail3)

// Single value negation cases

aidFilterFuncNegationDiscard := NewSpEnrichedFilterFunction("app_id!=test-data3")

aidFilteredOutNegated, fail4, _ := aidFilterFuncNegationDiscard(&messageGood, nil)

assert.Nil(aidFilteredOutNegated)
assert.Nil(fail4)

aidFilterFuncNegationKeep := NewSpEnrichedFilterFunction("app_id!=failThis")

aidFilteredInNegated, fail5, _ := aidFilterFuncNegationKeep(&messageGood, nil)

assert.Equal(snowplowTsv3, aidFilteredInNegated.Data)
assert.Nil(fail5)

// Multiple value negation cases
aidFilterFuncNegationDiscardMultiple := NewSpEnrichedFilterFunction("app_id!=someotherValue|test-data1|test-data2|test-data3")

aidFilteredDiscardedWithMultiple, fail6, _ := aidFilterFuncNegationDiscardMultiple(&messageGood, nil)

assert.Nil(aidFilteredDiscardedWithMultiple)
assert.Nil(fail6)

aidFilterFuncNegationKeptMultiple := NewSpEnrichedFilterFunction("app_id!=someotherValue|failThis")

aidFilteredKeptWithMultiple, fail7, _ := aidFilterFuncNegationKeptMultiple(&messageGood, nil)

assert.Equal(snowplowTsv3, aidFilteredKeptWithMultiple.Data)
assert.Nil(fail7)
}

func TestSpEnrichedFilterFunction_Slice(t *testing.T) {
assert := assert.New(t)

var expectedFilter1 = []*models.Message{
{
Data: snowplowTsv1,
PartitionKey: "some-key",
},
}

filter1 := NewTransformation(NewSpEnrichedFilterFunction("app_id==test-data1"))
filter1Res := filter1(messages)

assert.Equal(len(expectedFilter1), len(filter1Res.Result))
assert.Equal(1, len(filter1Res.Invalid))

var expectedFilter2 = []*models.Message{
{
Data: snowplowTsv1,
PartitionKey: "some-key",
},
{
Data: snowplowTsv2,
PartitionKey: "some-key1",
},
}

filter2 := NewTransformation(NewSpEnrichedFilterFunction("app_id==test-data1|test-data2"))
filter2Res := filter2(messages)

assert.Equal(len(expectedFilter2), len(filter2Res.Result))
assert.Equal(1, len(filter2Res.Invalid))

var expectedFilter3 = []*models.Message{
{
Data: snowplowTsv3,
PartitionKey: "some-key3",
},
}

filter3 := NewTransformation(NewSpEnrichedFilterFunction("app_id!=test-data1|test-data2"))
filter3Res := filter3(messages)

assert.Equal(len(expectedFilter3), len(filter3Res.Result))
assert.Equal(1, len(filter3Res.Invalid))

/*
for index, value := range enrichJsonRes.Result {
assert.Equal(expectedGood[index].Data, value.Data)
assert.Equal(expectedGood[index].PartitionKey, value.PartitionKey)
assert.NotNil(expectedGood[index].TimeTransformed)

// assertions to ensure we don't accidentally modify the input
assert.NotEqual(messages[index].Data, value.Data)
// assert can't seem to deal with comparing zero value to non-zero value, so assert that it's still zero instead
assert.Equal(time.Time{}, messages[index].TimeTransformed)
}

// Not matching equivalence of whole object because error stacktrace makes it unfeasible. Doing each component part instead.
assert.Equal(1, len(enrichJsonRes.Invalid))
assert.Equal(int64(1), enrichJsonRes.InvalidCount)
assert.Equal("Cannot parse tsv event - wrong number of fields provided: 4", enrichJsonRes.Invalid[0].GetError().Error())
assert.Equal([]byte("not a snowplow event"), enrichJsonRes.Invalid[0].Data)
assert.Equal("some-key4", enrichJsonRes.Invalid[0].PartitionKey)
*/
}

// TODO: add tests checking slice of messages against output.
4 changes: 2 additions & 2 deletions pkg/transform/snowplow_enriched_set_pk_test.go
@@ -31,7 +31,7 @@ func TestNewSpEnrichedSetPkFunction(t *testing.T) {

stringAsPk, fail, intermediate := aidSetPkFunc(&messageGood, nil)

- assert.Equal("test-data", stringAsPk.PartitionKey)
+ assert.Equal("test-data3", stringAsPk.PartitionKey)
assert.Equal(spTsv3Parsed, intermediate)
assert.Nil(fail)

@@ -68,7 +68,7 @@ func TestNewSpEnrichedSetPkFunction(t *testing.T) {

expected := models.Message{
Data: snowplowTsv1,
- PartitionKey: "test-data",
+ PartitionKey: "test-data1",
}
incompatibleIntermediate := "Incompatible intermediate state"

43 changes: 21 additions & 22 deletions pkg/transform/transform_test.go
@@ -14,25 +14,6 @@ import (
"github.com/stretchr/testify/assert"
)

- var messages = []*models.Message{
- {
- Data: snowplowTsv1,
- PartitionKey: "some-key",
- },
- {
- Data: snowplowTsv2,
- PartitionKey: "some-key1",
- },
- {
- Data: snowplowTsv3,
- PartitionKey: "some-key2",
- },
- {
- Data: nonSnowplowString,
- PartitionKey: "some-key4",
- },
- }

// To test a function which creates a function, we're creating the function then testing that. Not sure if there's a better way?
func TestNewTransformation_Passthrough(t *testing.T) {
assert := assert.New(t)
@@ -104,21 +85,39 @@ func TestNewTransformation_EnrichedToJson(t *testing.T) {
assert.Equal("some-key4", enrichJsonRes.Invalid[0].PartitionKey)
}

+ func Benchmark_Transform_EnrichToJson(b *testing.B) {
+ tranformEnrichJson := NewTransformation(SpEnrichedToJson)
+ for i := 0; i < b.N; i++ {
+ tranformEnrichJson(messages)
+ }
+ }
+
+ func testfunc(message *models.Message, intermediateState interface{}) (*models.Message, *models.Message, interface{}) {
+ return message, nil, nil
+ }
+
+ func Benchmark_Transform_Passthrough(b *testing.B) {
+ tranformPassthrough := NewTransformation(testfunc)
+ for i := 0; i < b.N; i++ {
+ tranformPassthrough(messages)
+ }
+ }

func TestNewTransformation_Multiple(t *testing.T) {
assert := assert.New(t)

var expectedGood = []*models.Message{
{
Data: snowplowJson1,
- PartitionKey: "test-data",
+ PartitionKey: "test-data1",
},
{
Data: snowplowJson2,
- PartitionKey: "test-data",
+ PartitionKey: "test-data2",
},
{
Data: snowplowJson3,
- PartitionKey: "test-data",
+ PartitionKey: "test-data3",
},
}
