From 4ec02286e80be00cc7d0bad91497eece4ea237bb Mon Sep 17 00:00:00 2001 From: mingji Date: Tue, 31 Dec 2024 15:37:28 +0800 Subject: [PATCH] [CELEBORN-1811] Update default value for `celeborn.master.slot.assign.extraSlots` ### What changes were proposed in this pull request? Change the default value of `celeborn.master.slot.assign.extraSlots` from `2` to `100` to avoid possible worker load skew for stages with very few reducers. ### Why are the changes needed? If a stage has very few reducers and skewed partitions, the previous default value (`2`) leads to serious worker load imbalance, leaving some workers unable to handle the shuffle data. ### Does this PR introduce _any_ user-facing change? Yes ### How was this patch tested? GA and cluster test. Closes #3039 from FMX/1811. Authored-by: mingji Signed-off-by: SteNicholas --- .../main/scala/org/apache/celeborn/common/CelebornConf.scala | 2 +- docs/configuration/master.md | 2 +- docs/migration.md | 2 ++ 3 files changed, 4 insertions(+), 2 deletions(-) diff --git a/common/src/main/scala/org/apache/celeborn/common/CelebornConf.scala b/common/src/main/scala/org/apache/celeborn/common/CelebornConf.scala index d7c4b058f04..a8cdf8b7d09 100644 --- a/common/src/main/scala/org/apache/celeborn/common/CelebornConf.scala +++ b/common/src/main/scala/org/apache/celeborn/common/CelebornConf.scala @@ -2883,7 +2883,7 @@ object CelebornConf extends Logging { .version("0.3.0") .doc("Extra slots number when master assign slots.") .intConf - .createWithDefault(2) + .createWithDefault(100) val MASTER_SLOT_ASSIGN_MAX_WORKERS: ConfigEntry[Int] = buildConf("celeborn.master.slot.assign.maxWorkers") diff --git a/docs/configuration/master.md b/docs/configuration/master.md index 5368296a779..894acced7ad 100644 --- a/docs/configuration/master.md +++ b/docs/configuration/master.md @@ -72,7 +72,7 @@ license: | | celeborn.master.port | 9097 | false | Port for master to bind. | 0.2.0 | | | celeborn.master.rackResolver.refresh.interval | 30s | false | Interval for refreshing the node rack information periodically. 
| 0.5.0 | | | celeborn.master.send.applicationMeta.threads | 8 | false | Number of threads used by the Master to send ApplicationMeta to Workers. | 0.5.0 | | -| celeborn.master.slot.assign.extraSlots | 2 | false | Extra slots number when master assign slots. | 0.3.0 | celeborn.slots.assign.extraSlots | +| celeborn.master.slot.assign.extraSlots | 100 | false | Extra slots number when master assign slots. | 0.3.0 | celeborn.slots.assign.extraSlots | | celeborn.master.slot.assign.loadAware.diskGroupGradient | 0.1 | false | This value means how many more workload will be placed into a faster disk group than a slower group. | 0.3.0 | celeborn.slots.assign.loadAware.diskGroupGradient | | celeborn.master.slot.assign.loadAware.fetchTimeWeight | 1.0 | false | Weight of average fetch time when calculating ordering in load-aware assignment strategy | 0.3.0 | celeborn.slots.assign.loadAware.fetchTimeWeight | | celeborn.master.slot.assign.loadAware.flushTimeWeight | 0.0 | false | Weight of average flush time when calculating ordering in load-aware assignment strategy | 0.3.0 | celeborn.slots.assign.loadAware.flushTimeWeight | diff --git a/docs/migration.md b/docs/migration.md index 7390fe8d717..e9ceb8c0376 100644 --- a/docs/migration.md +++ b/docs/migration.md @@ -23,6 +23,8 @@ license: | # Upgrading from 0.5 to 0.6 +- Since 0.6.0, Celeborn changed the default value of `celeborn.master.slot.assign.extraSlots` from `2` to `100`, which means Celeborn will involve more workers in offering slots. + - Since 0.6.0, Celeborn deprecate `celeborn.worker.congestionControl.low.watermark`. Please use `celeborn.worker.congestionControl.diskBuffer.low.watermark` instead. - Since 0.6.0, Celeborn deprecate `celeborn.worker.congestionControl.high.watermark`. Please use `celeborn.worker.congestionControl.diskBuffer.high.watermark` instead.