Skip to content

Commit

Permalink
Test for AQL packet decoding for SQ Waves (#765)
Browse files Browse the repository at this point in the history
Checks the decoding of AQL packets for SQ Waves by
launching a kernel, injecting the AQL packets, and
decoding the result. This does not use write interceptor
but does this check on a raw HSA stream with direct
injection.
  • Loading branch information
bwelton authored Oct 21, 2024
1 parent d5b7c39 commit c6ae396
Show file tree
Hide file tree
Showing 3 changed files with 203 additions and 5 deletions.
174 changes: 173 additions & 1 deletion source/lib/rocprofiler-sdk/counters/tests/device_counting.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -295,7 +295,7 @@ class device_counting_service_test : public ::testing::Test
aql::set_profiler_active_on_queue(
agent.cpu_pool(), agent.get_hsa_agent(), [&](hsa::rocprofiler_packet pkt) {
pkt.ext_amd_aql_pm4.completion_signal = completion_signal;
submitPacket(queue, (void*) &pkt);
submitPacket(queue, (const void*) &pkt);

if(hsa_signal_wait_relaxed(completion_signal,
HSA_SIGNAL_CONDITION_EQ,
Expand Down Expand Up @@ -425,6 +425,173 @@ class device_counting_service_test : public ::testing::Test
registration::set_init_status(1);
context::pop_client(1);
}

// Inject AQL Packets directly into a userspace queue. This tests that the packets
// we get from AQLProfile work as expected. A failure in this test means that the AQL
// packets are likely not valid.
static void check_raw_aql_packets(const std::string& metric_to_test,
size_t iter_count,
const std::vector<double>& expected_values)
{
using namespace rocprofiler::counters;
using namespace rocprofiler::hsa;

auto header_pkt = [](hsa_packet_type_t type) {
uint16_t header = type << HSA_PACKET_HEADER_TYPE;
header |= 1 << HSA_PACKET_HEADER_BARRIER;
header |= HSA_FENCE_SCOPE_SYSTEM << HSA_PACKET_HEADER_SCACQUIRE_FENCE_SCOPE;
header |= HSA_FENCE_SCOPE_SYSTEM << HSA_PACKET_HEADER_SCRELEASE_FENCE_SCOPE;
return header;
};

registration::init_logging();
registration::set_init_status(-1);
context::push_client(1);

CHECK_EQ(hsa_init(), HSA_STATUS_SUCCESS);
test_init();

const auto& supported_agents = hsa::get_queue_controller()->get_supported_agents();
ASSERT_GT(supported_agents.size(), 0);

int* gpuMem;
[[maybe_unused]] hipDeviceProp_t devProp;
auto status = hipGetDeviceProperties(&devProp, 0);
CHECK_EQ(status, HSA_STATUS_SUCCESS);
status = hipMalloc((void**) &gpuMem, 1 * sizeof(int));
CHECK_EQ(status, HSA_STATUS_SUCCESS);

bool test_ran = false;
CHECK(!supported_agents.empty());
for(const auto& [_, gpu_agent] : supported_agents)
{
test_kernels kernel_loader(gpu_agent);
auto kernel_handle = kernel_loader.load_kernel(gpu_agent, "null_kernel");

ROCP_ERROR << fmt::format("Running test on agent {:x}",
gpu_agent.get_hsa_agent().handle);

const auto* agent_map = rocprofiler::common::get_val(counters::get_ast_map(),
std::string(gpu_agent.name()));
CHECK(agent_map);
const auto* original_ast = rocprofiler::common::get_val(*agent_map, metric_to_test);
CHECK(original_ast);
auto counter_ast = *original_ast;
std::set<counters::Metric> required_counters;
counter_ast.get_required_counters(*agent_map, required_counters);
std::vector<counters::Metric> req_cnt(required_counters.begin(),
required_counters.end());
CHECK(!req_cnt.empty());
aql::CounterPacketConstruct pkt_constructor(gpu_agent.get_rocp_agent()->id, req_cnt);

// Construct the queue to test on
hsa_queue_t* queue;
CHECK_EQ(hsa_queue_create(gpu_agent.get_hsa_agent(),
1024,
HSA_QUEUE_TYPE_SINGLE,
nullptr,
nullptr,
UINT32_MAX,
UINT32_MAX,
&queue),
HSA_STATUS_SUCCESS);

auto kern_pkt = gen_kernel_pkt(kernel_handle);
auto inst_pkts = pkt_constructor.construct_packet(get_api_table(), get_ext_table());
inst_pkts->packets.start_packet.header = header_pkt(HSA_PACKET_TYPE_VENDOR_SPECIFIC);
inst_pkts->packets.start_packet.completion_signal.handle = 0;
inst_pkts->packets.stop_packet.header = header_pkt(HSA_PACKET_TYPE_VENDOR_SPECIFIC);
inst_pkts->packets.read_packet.completion_signal.handle = 0;
inst_pkts->packets.read_packet.header = header_pkt(HSA_PACKET_TYPE_VENDOR_SPECIFIC);

std::vector<rocprofiler_packet> packets;
packets.emplace_back().ext_amd_aql_pm4 = inst_pkts->packets.start_packet;
packets.emplace_back() = kern_pkt;
packets.emplace_back().ext_amd_aql_pm4 = inst_pkts->packets.read_packet;
packets.emplace_back().ext_amd_aql_pm4 = inst_pkts->packets.stop_packet;

// Insert barriers for all packets
auto blocked_packets = [&]() {
std::vector<rocprofiler_packet> blocked;
for(auto& pkt : packets)
{
rocprofiler_packet barrier{};
hsa_signal_t block_signal;
hsa_signal_create(1, 0, nullptr, &block_signal);
pkt.ext_amd_aql_pm4.completion_signal.handle = block_signal.handle;
blocked.push_back(pkt);

barrier.barrier_and.header = header_pkt(HSA_PACKET_TYPE_BARRIER_AND);
barrier.barrier_and.dep_signal[0] = block_signal;
barrier.barrier_and.completion_signal.handle = block_signal.handle;
blocked.push_back(barrier);
}
return blocked;
}();

CHECK(inst_pkts);

for(size_t i = 0; i < iter_count; i++)
{
for(auto& pkt : blocked_packets)
{
hsa_signal_store_screlease(pkt.ext_amd_aql_pm4.completion_signal, 1);
}

for(auto& pkt : blocked_packets)
{
::submitPacket(queue, (const void*) &pkt.ext_amd_aql_pm4);
}
hsa_signal_wait_relaxed(blocked_packets.back().ext_amd_aql_pm4.completion_signal,
HSA_SIGNAL_CONDITION_EQ,
-1,
UINT32_MAX,
HSA_WAIT_STATE_ACTIVE);

ROCP_ERROR << "Processing Next...";
auto decoded_pkt = counters::EvaluateAST::read_pkt(&pkt_constructor, *inst_pkts);
CHECK(!decoded_pkt.empty());
ROCP_ERROR << "Decoded Packet:";
for(const auto& [id, data_vec] : decoded_pkt)
{
ROCP_ERROR << fmt::format("\t[{} = {}]", id, fmt::join(data_vec, ","));
}

std::vector<std::unique_ptr<std::vector<rocprofiler_record_counter_t>>> cache;
auto* ret = counter_ast.evaluate(decoded_pkt, cache);
CHECK(!ret->empty());
ROCP_ERROR << fmt::format(
"Final Decoded Counter Values: {} (iter={})", fmt::join(*ret, ","), i);

CHECK_EQ(ret->size(), expected_values.size());
size_t pos = 0;
for(const auto& v : expected_values)
{
CHECK_EQ(v, ret->at(pos).counter_value);
pos++;
}
}

std::set<uint64_t> signals_deleted;
for(auto& pkt : packets)
{
if(signals_deleted.find(pkt.ext_amd_aql_pm4.completion_signal.handle) ==
signals_deleted.end())
{
hsa_signal_destroy(pkt.ext_amd_aql_pm4.completion_signal);
signals_deleted.insert(pkt.ext_amd_aql_pm4.completion_signal.handle);
}
}
test_ran = true;
}

CHECK_EQ(hipFree(gpuMem), hipSuccess);

CHECK(test_ran);

registration::set_init_status(1);
registration::finalize();
}
};

TEST_F(device_counting_service_test, sync_counters) { test_run(); }
Expand Down Expand Up @@ -476,3 +643,8 @@ TEST_F(device_counting_service_test, sync_sq_waves_verify)
EXPECT_GT(val.counter_value, 0.0);
}
}

TEST_F(device_counting_service_test, raw_sq_waves_verify)
{
check_raw_aql_packets("SQ_WAVES_sum", 1000, {1.0});
}
23 changes: 23 additions & 0 deletions source/lib/rocprofiler-sdk/hsa/queue_controller.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -255,8 +255,18 @@ QueueController::init(CoreApiTable& core_table, AmdExtTable& ext_table)
for(const auto* itr : agents)
{
const auto* cached_agent = agent::get_agent_cache(itr);
ROCP_TRACE << fmt::format(
"RocP Agent {:x} has Cache Agent? {}", itr->id.handle, cached_agent ? "yes" : "no");
if(cached_agent)
{
ROCP_TRACE << fmt::format("RocP Agent {:x} Type {}",
itr->id.handle,
(int) cached_agent->get_rocp_agent()->type);
}

if(cached_agent && cached_agent->get_rocp_agent()->type == ROCPROFILER_AGENT_TYPE_GPU)
{
ROCP_TRACE << fmt::format("RocP Agent {:x} is added to cache", itr->id.handle);
get_supported_agents().emplace(cached_agent->index(), *cached_agent);
}
}
Expand Down Expand Up @@ -378,10 +388,23 @@ QueueController::iterate_callbacks(const callback_iterator_cb_t& cb) const
});
}

const QueueController::agent_cache_map_t&
QueueController::get_supported_agents() const
{
return _supported_agents;
}

QueueController::agent_cache_map_t&
QueueController::get_supported_agents()
{
return _supported_agents;
}

QueueController*
get_queue_controller()
{
static auto*& controller = common::static_object<QueueController>::construct();
LOG(ERROR) << (uint64_t) controller;
return controller;
}

Expand Down
11 changes: 7 additions & 4 deletions source/lib/rocprofiler-sdk/hsa/queue_controller.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -46,8 +46,10 @@ class QueueController
using queue_iterator_cb_t = std::function<void(const Queue*)>;
using callback_iterator_cb_t = std::function<void(ClientID, const agent_callback_tuple_t&)>;
using queue_map_t = std::unordered_map<hsa_queue_t*, std::unique_ptr<Queue>>;
using agent_cache_map_t = std::unordered_map<uint32_t, AgentCache>;

QueueController() = default;
~QueueController() { ROCP_ERROR << "Destroying Queue"; }
// Initializes the QueueInterceptor. This must be delayed until
// HSA has been inited.
void init(CoreApiTable& core_table, AmdExtTable& ext_table);
Expand All @@ -68,8 +70,9 @@ class QueueController
const AmdExtTable& get_ext_table() const { return _ext_table; }

// Gets the list of supported HSA agents that can be Pintercepted
const auto& get_supported_agents() const { return _supported_agents; }
auto& get_supported_agents() { return _supported_agents; }
const agent_cache_map_t& get_supported_agents() const;

agent_cache_map_t& get_supported_agents();

const Queue* get_queue(const hsa_queue_t&) const;

Expand Down Expand Up @@ -101,8 +104,8 @@ class QueueController
#endif

private:
using client_id_map_t = std::unordered_map<ClientID, agent_callback_tuple_t>;
using agent_cache_map_t = std::unordered_map<uint32_t, AgentCache>;
using client_id_map_t = std::unordered_map<ClientID, agent_callback_tuple_t>;
using resource_alloc_t = void(const AgentCache&, const CoreApiTable&, const AmdExtTable&);

CoreApiTable _core_table = {};
AmdExtTable _ext_table = {};
Expand Down

0 comments on commit c6ae396

Please sign in to comment.