From 5b8ac321c6905de062373abf9cecd6180ae26ebb Mon Sep 17 00:00:00 2001 From: Tim Rozet Date: Mon, 4 Nov 2019 11:17:03 -0500 Subject: [PATCH 1/2] Allows specifying socket/numa node for BESS Using "-n" flag a user may now specify the desired socket to allocate huge pages from when using BESS. Also includes a check for worker core existing on the correct socket before trying to launch a worker. If worker is launched on a core on an unused socket GetDefaultPool will crash bessd. Signed-off-by: Tim Rozet --- core/bessctl.cc | 6 ++++++ core/dpdk.cc | 21 ++++++++++++++------- core/dpdk.h | 2 +- core/opts.cc | 13 +++++++++++++ core/opts.h | 1 + core/packet_pool.cc | 32 +++++++++++++++----------------- 6 files changed, 50 insertions(+), 25 deletions(-) diff --git a/core/bessctl.cc b/core/bessctl.cc index 8d63761c9..f311d3bcd 100644 --- a/core/bessctl.cc +++ b/core/bessctl.cc @@ -567,6 +567,12 @@ class BESSControlImpl final : public BESSControl::Service { if (!is_cpu_present(core)) { return return_with_error(response, EINVAL, "Invalid core %d", core); } + + if (rte_lcore_to_socket_id(core) != (unsigned int) FLAGS_n) { + return return_with_error(response, EINVAL, "Invalid core %d not on socket %d", + core, FLAGS_n); + } + if (is_worker_active(wid)) { return return_with_error(response, EEXIST, "worker:%d is already active", wid); diff --git a/core/dpdk.cc b/core/dpdk.cc index c2a220cb4..63cc05d76 100644 --- a/core/dpdk.cc +++ b/core/dpdk.cc @@ -98,7 +98,7 @@ class CmdLineOpts { std::vector argv_; }; -void init_eal(int dpdk_mb_per_socket, std::string nonworker_corelist) { +void init_eal(int dpdk_mb_per_socket, int socket, std::string nonworker_corelist) { CmdLineOpts rte_args{ "bessd", "--master-lcore", @@ -118,11 +118,18 @@ void init_eal(int dpdk_mb_per_socket, std::string nonworker_corelist) { // memory in advance. We allocate 512MB (this is shared among nodes). rte_args.Append({"-m", "512"}); } else { - std::string opt_socket_mem = std::to_string(dpdk_mb_per_socket); - for (int i = 1; i < NumNumaNodes(); i++) { - opt_socket_mem += "," + std::to_string(dpdk_mb_per_socket); + std::string opt_socket_mem; + for (int i = 0; i < NumNumaNodes(); i++) { + if (i == socket) { + opt_socket_mem += std::to_string(dpdk_mb_per_socket); + } else { + opt_socket_mem += "0"; + } + if (i+1 != NumNumaNodes()) { + opt_socket_mem += ","; + } } - + LOG(INFO) << "socket mem set as: " << opt_socket_mem; rte_args.Append({"--socket-mem", opt_socket_mem}); // Unlink mapped hugepage files so that memory can be reclaimed as soon as @@ -213,13 +220,13 @@ bool IsDpdkInitialized() { return is_initialized; } -void InitDpdk(int dpdk_mb_per_socket) { +void InitDpdk(int dpdk_mb_per_socket, int socket) { current_worker.SetNonWorker(); if (!is_initialized) { is_initialized = true; LOG(INFO) << "Initializing DPDK"; - init_eal(dpdk_mb_per_socket, GetNonWorkerCoreList()); + init_eal(dpdk_mb_per_socket, socket, GetNonWorkerCoreList()); } } diff --git a/core/dpdk.h b/core/dpdk.h index 6ae3ad7fe..3ef0d2732 100644 --- a/core/dpdk.h +++ b/core/dpdk.h @@ -37,7 +37,7 @@ bool IsDpdkInitialized(); // Initialize DPDK, with the specified amount of hugepage memory. // Safe to call multiple times. -void InitDpdk(int dpdk_mb_per_socket = 0); +void InitDpdk(int dpdk_mb_per_socket = 0, int socket = 0); } // namespace bess diff --git a/core/opts.cc b/core/opts.cc index b4e95abb5..4ccbd4bd0 100644 --- a/core/opts.cc +++ b/core/opts.cc @@ -35,6 +35,7 @@ #include "bessd.h" #include "worker.h" +#include "memory.h" // Port this BESS instance listens on. // Panda came up with this default number @@ -106,6 +107,18 @@ DEFINE_int32(m, 1024, static const bool _m_dummy[[maybe_unused]] = google::RegisterFlagValidator(&FLAGS_m, &ValidateMegabytesPerSocket); +static bool ValidateNumaNode(const char *, int32_t value) { + if (value < 0 || value >= bess::NumNumaNodes()) { + LOG(ERROR) << "Invalid Numa node: " << value; + return false; + } + return true; +} +DEFINE_int32(n, 0, + "Specifies which numa node to run BESS on"); +static const bool _n_dummy[[maybe_unused]] = + google::RegisterFlagValidator(&FLAGS_n, &ValidateNumaNode); + static bool ValidateBuffersPerSocket(const char *, int32_t value) { if (value <= 0) { LOG(ERROR) << "Invalid number of buffers: " << value; diff --git a/core/opts.h b/core/opts.h index dc29e6bd4..f8cd15fca 100644 --- a/core/opts.h +++ b/core/opts.h @@ -44,6 +44,7 @@ DECLARE_string(b); DECLARE_int32(p); DECLARE_string(grpc_url); DECLARE_int32(m); +DECLARE_int32(n); DECLARE_bool(skip_root_check); DECLARE_string(modules); DECLARE_bool(core_dump); diff --git a/core/packet_pool.cc b/core/packet_pool.cc index 7318cee65..c615d6446 100644 --- a/core/packet_pool.cc +++ b/core/packet_pool.cc @@ -36,27 +36,25 @@ void DoMunmap(rte_mempool_memhdr *memhdr, void *) { PacketPool *PacketPool::default_pools_[RTE_MAX_NUMA_NODES]; void PacketPool::CreateDefaultPools(size_t capacity) { - InitDpdk(FLAGS_dpdk ? FLAGS_m : 0); + InitDpdk(FLAGS_dpdk ? FLAGS_m : 0, FLAGS_n); rte_dump_physmem_layout(stdout); - for (int sid = 0; sid < NumNumaNodes(); sid++) { - if (FLAGS_m == 0) { - LOG(WARNING) << "Hugepage is disabled! Creating PlainPacketPool for " - << capacity << " packets on node " << sid; - default_pools_[sid] = new PlainPacketPool(capacity, sid); - } else if (FLAGS_dpdk) { - LOG(INFO) << "Creating DpdkPacketPool for " << capacity - << " packets on node " << sid; - default_pools_[sid] = new DpdkPacketPool(capacity, sid); - } else { - LOG(INFO) << "Creating BessPacketPool for " << capacity - << " packets on node " << sid; - default_pools_[sid] = new BessPacketPool(capacity, sid); - } - CHECK(default_pools_[sid]) - << "Packet pool allocation on node " << sid << " failed!"; + if (FLAGS_m == 0) { + LOG(WARNING) << "Hugepage is disabled! Creating PlainPacketPool for " + << capacity << " packets on node " << FLAGS_n; + default_pools_[FLAGS_n] = new PlainPacketPool(capacity, FLAGS_n); + } else if (FLAGS_dpdk) { + LOG(INFO) << "Creating DpdkPacketPool for " << capacity + << " packets on node " << FLAGS_n; + default_pools_[FLAGS_n] = new DpdkPacketPool(capacity, FLAGS_n); + } else { + LOG(INFO) << "Creating BessPacketPool for " << capacity + << " packets on node " << FLAGS_n; + default_pools_[FLAGS_n] = new BessPacketPool(capacity, FLAGS_n); } + CHECK(default_pools_[FLAGS_n]) + << "Packet pool allocation on node " << FLAGS_n << " failed!"; } PacketPool::PacketPool(size_t capacity, int socket_id) { From 105f2bb35338733231eeb0804132b137bdf0b87f Mon Sep 17 00:00:00 2001 From: Tim Rozet Date: Wed, 6 Nov 2019 10:46:29 -0500 Subject: [PATCH 2/2] Fixes socket specified for worker and pmd Assumptions were being made to use socket 0 when detection for socket id failed. This patch defaults to the socket specified at runtime. Signed-off-by: Tim Rozet --- core/drivers/pmd.cc | 20 +++++++++++--------- core/worker.cc | 17 +++++++++++------ 2 files changed, 22 insertions(+), 15 deletions(-) diff --git a/core/drivers/pmd.cc b/core/drivers/pmd.cc index 3ed683156..c1090e98c 100644 --- a/core/drivers/pmd.cc +++ b/core/drivers/pmd.cc @@ -34,6 +34,7 @@ #include "../utils/ether.h" #include "../utils/format.h" +#include "../opts.h" /*! * The following are deprecated. Ignore us. @@ -266,11 +267,19 @@ CommandResponse PMDPort::Init(const bess::pb::PMDPortArg &arg) { } rte_eth_promiscuous_enable(ret_port_id); + int sid = rte_eth_dev_socket_id(ret_port_id); + + /* if socket_id is invalid, set to arg */ + if (sid < 0 || sid > RTE_MAX_NUMA_NODES) { + sid = FLAGS_n; + LOG(WARNING) << "Unable to detect socket ID, defaulting to: " << sid; + } else { + LOG(INFO) << "socket ID in pmd is: " < RTE_MAX_NUMA_NODES) { - sid = 0; - } - ret = rte_eth_rx_queue_setup(ret_port_id, i, queue_size[PACKET_DIR_INC], sid, ð_rxconf, bess::PacketPool::GetDefaultPool(sid)->pool()); diff --git a/core/worker.cc b/core/worker.cc index c7c1c60f7..a73e5cfb5 100644 --- a/core/worker.cc +++ b/core/worker.cc @@ -290,15 +290,20 @@ void *Worker::Run(void *_arg) { /* for workers, wid == rte_lcore_id() */ wid_ = arg->wid; core_ = arg->core; - socket_ = rte_socket_id(); + socket_ = FLAGS_n; - // For some reason, rte_socket_id() does not return a correct NUMA ID. - // Nevertheless, BESS should not crash. - if (socket_ == SOCKET_ID_ANY) { - LOG(WARNING) << "rte_socket_id() returned -1 for " << arg->core; - socket_ = 0; + if (rte_lcore_to_socket_id((unsigned int) core_) != (unsigned int) socket_) { + LOG(ERROR) << "Core specified: " << core_ << " does not exist on socket: " + << socket_ << ". Cannot create worker"; + + delete scheduler_; + delete rand_; + + return nullptr; } + LOG(INFO) << "Running worker: " << wid_ << " on core " << core_ << " on socket " << socket_; + fd_event_ = eventfd(0, 0); CHECK_GE(fd_event_, 0);