-
Notifications
You must be signed in to change notification settings - Fork 708
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Adding second moment of values per key for Typed-API reduce operations #1279
base: develop
Are you sure you want to change the base?
Changes from all commits
05df02b
d4aff6b
c8eebf8
b48da0b
0f495b1
58c3f12
4e3db4f
3f5cb5e
2b767c0
1a8d6e2
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -476,7 +476,21 @@ package com.twitter.scalding { | |
} | ||
} | ||
|
||
/** In the typed API every reduce operation is handled by this Buffer */ | ||
private[scalding] object SkewMonitorCounters { | ||
// Strangely, if group name and key name are different, then the counter would be zero | ||
val KeyCount = "scalding.debug.reduce.key.count" | ||
val ValuesCountSum = "scalding.debug.reduce.value.count.sum" | ||
val ValuesCountSquareSum = "scalding.debug.reduce.value.count.sum.square" | ||
} | ||
|
||
class CountingIterator[T](wraps: Iterator[T]) extends Iterator[T] { | ||
// This Wrapper lets us know how many values have been iterated in an Iterator | ||
private[this] var nextCalls = 0L | ||
def hasNext = wraps.hasNext | ||
def next = { nextCalls += 1; wraps.next } | ||
def seen: Long = nextCalls | ||
} | ||
|
||
class TypedBufferOp[K, V, U]( | ||
conv: TupleConverter[K], | ||
@transient reduceFn: (K, Iterator[V]) => Iterator[U], | ||
|
@@ -487,9 +501,9 @@ package com.twitter.scalding { | |
def operate(flowProcess: FlowProcess[_], call: BufferCall[Any]) { | ||
val oc = call.getOutputCollector | ||
val key = conv(call.getGroup) | ||
val values = call.getArgumentsIterator | ||
val values = new CountingIterator(call.getArgumentsIterator | ||
.asScala | ||
.map(_.getObject(0).asInstanceOf[V]) | ||
.map(_.getObject(0).asInstanceOf[V])) | ||
|
||
// Avoiding a lambda here | ||
val resIter = reduceFnSer.get(key, values) | ||
|
@@ -498,6 +512,17 @@ package com.twitter.scalding { | |
tup.set(0, resIter.next) | ||
oc.add(tup) | ||
} | ||
|
||
val numValuesPerKey = values.seen | ||
|
||
flowProcess.increment(SkewMonitorCounters.KeyCount, SkewMonitorCounters.KeyCount, 1L) | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. probably better to use a group like "scalding.debug" since I think we want to group them up so they are easy to see next to each other in the job tracker UI. There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. I don't think we want the group and counter to be the same string. We want the group to be something like "scalding debug" and the counter value is fine. |
||
flowProcess.increment(SkewMonitorCounters.ValuesCountSum, SkewMonitorCounters.ValuesCountSum, numValuesPerKey) | ||
flowProcess.increment(SkewMonitorCounters.ValuesCountSquareSum, SkewMonitorCounters.ValuesCountSquareSum, numValuesPerKey * numValuesPerKey) | ||
|
||
// Uncomment the following to trigger the bug | ||
//flowProcess.increment("TestGroup", "TestKey", 1) | ||
|
||
} | ||
|
||
} | ||
} |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
this is bug by itself. Can you confirm this is still true and not due to another issue? We don't want to make a bunch of groups.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
This seems to be a bug. if you simply put
flowProcess.increment("TestGroup", "TestKey", 1)
in def operate(flowProcess: FlowProcess[_], call: BufferCall[Any]) {
and this should print 3 in the test
ReduceValueCounterTest. However it prints
PRINTING KEY AND GROUP! 0