Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Add network quality of service support #42

Open
wants to merge 4 commits into
base: develop
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -27,6 +27,7 @@
*.a
*.d
tracer/traceR
bin/traceR

# hide backups
*~
Expand Down
265 changes: 265 additions & 0 deletions docs/scorep-5.0.patch
Original file line number Diff line number Diff line change
@@ -0,0 +1,265 @@
diff -Naur scorep-5.0.original/src/adapters/mpi/SCOREP_Mpi.h scorep-5.0/src/adapters/mpi/SCOREP_Mpi.h
--- scorep-5.0.original/src/adapters/mpi/SCOREP_Mpi.h 2019-03-22 10:41:49.127806703 -0700
+++ scorep-5.0/src/adapters/mpi/SCOREP_Mpi.h 2019-04-03 01:58:09.441425000 -0700
@@ -119,6 +119,8 @@
* Flag which indicates whether event generation is turned on/off.
*/
extern bool scorep_mpi_generate_events;
+extern bool scorep_mpi_reduce_probe_test;
+extern uint64_t scorep_mpi_last_probe_test_evt;

/**
Flag which indicates whether recording of MPI topologies is enabled.
diff -Naur scorep-5.0.original/src/adapters/mpi/SCOREP_Mpi_P2p.c scorep-5.0/src/adapters/mpi/SCOREP_Mpi_P2p.c
--- scorep-5.0.original/src/adapters/mpi/SCOREP_Mpi_P2p.c 2019-03-22 10:41:49.127806703 -0700
+++ scorep-5.0/src/adapters/mpi/SCOREP_Mpi_P2p.c 2019-04-24 14:22:41.803509000 -0700
@@ -74,6 +74,8 @@
*/
static int scorep_mpi_status_array_size = 0;

+extern uint64_t SCOREP_get_number_of_events();
+
/**
* Get a pointer to a status array of at least 'size' statuses
* @param size minimal requested size
@@ -1444,8 +1446,11 @@
const int event_gen_active = SCOREP_MPI_IS_EVENT_GEN_ON;
const int event_gen_active_for_group = SCOREP_MPI_IS_EVENT_GEN_ON_FOR( SCOREP_MPI_ENABLED_P2P );
int return_val;
+ uint64_t num_evts_written = SCOREP_get_number_of_events();
+ bool gen_region = !scorep_mpi_reduce_probe_test ||
+ (num_evts_written != scorep_mpi_last_probe_test_evt);

- if ( event_gen_active )
+ if ( event_gen_active && gen_region )
{
SCOREP_MPI_EVENT_GEN_OFF();
if ( event_gen_active_for_group )
@@ -1462,7 +1467,7 @@
return_val = PMPI_Iprobe( source, tag, comm, flag, status );
SCOREP_EXIT_WRAPPED_REGION();

- if ( event_gen_active )
+ if ( event_gen_active && gen_region )
{
if ( event_gen_active_for_group )
{
@@ -1473,6 +1478,7 @@
SCOREP_ExitWrapper( scorep_mpi_regions[ SCOREP_MPI_REGION__MPI_IPROBE ] );
}
SCOREP_MPI_EVENT_GEN_ON();
+ scorep_mpi_last_probe_test_evt = SCOREP_get_number_of_events();
}

SCOREP_IN_MEASUREMENT_DECREMENT();
@@ -1918,8 +1924,11 @@
scorep_mpi_request* orig_req;
MPI_Status mystatus;
uint64_t start_time_stamp;
+ uint64_t num_evts_written = SCOREP_get_number_of_events();
+ bool gen_region = !scorep_mpi_reduce_probe_test ||
+ (num_evts_written != scorep_mpi_last_probe_test_evt);

- if ( event_gen_active )
+ if ( event_gen_active && gen_region )
{
SCOREP_MPI_EVENT_GEN_OFF();

@@ -1963,8 +1972,9 @@
SCOREP_MpiRequestTested( orig_req->id );
}

- if ( event_gen_active )
+ if ( event_gen_active && gen_region )
{
+ bool updateEvt = (scorep_mpi_last_probe_test_evt == SCOREP_get_number_of_events());
if ( event_gen_active_for_group )
{
SCOREP_ExitRegion( scorep_mpi_regions[ SCOREP_MPI_REGION__MPI_TEST ] );
@@ -1975,6 +1985,9 @@
}

SCOREP_MPI_EVENT_GEN_ON();
+ if(updateEvt) {
+ scorep_mpi_last_probe_test_evt = SCOREP_get_number_of_events();
+ }
}

SCOREP_IN_MEASUREMENT_DECREMENT();
@@ -2006,8 +2019,11 @@
scorep_mpi_request* orig_req;
MPI_Status mystatus;
uint64_t start_time_stamp;
+ uint64_t num_evts_written = SCOREP_get_number_of_events();
+ bool gen_region = !scorep_mpi_reduce_probe_test ||
+ (num_evts_written != scorep_mpi_last_probe_test_evt);

- if ( event_gen_active )
+ if ( event_gen_active && gen_region )
{
SCOREP_MPI_EVENT_GEN_OFF();

@@ -2019,6 +2035,7 @@
{
SCOREP_EnterWrapper( scorep_mpi_regions[ SCOREP_MPI_REGION__MPI_TESTANY ] );
}
+ scorep_mpi_last_probe_test_evt = SCOREP_get_number_of_events();
}

#if !defined( SCOREP_MPI_NO_HOOKS )
@@ -2072,8 +2089,9 @@
#endif
scorep_mpi_check_request( orig_req, status );
}
- if ( event_gen_active )
+ if ( event_gen_active && gen_region )
{
+ bool updateEvt = (scorep_mpi_last_probe_test_evt == SCOREP_get_number_of_events());
if ( event_gen_active_for_group )
{
SCOREP_ExitRegion( scorep_mpi_regions[ SCOREP_MPI_REGION__MPI_TESTANY ] );
@@ -2084,6 +2102,9 @@
}

SCOREP_MPI_EVENT_GEN_ON();
+ if(updateEvt) {
+ scorep_mpi_last_probe_test_evt = SCOREP_get_number_of_events();
+ }
}

SCOREP_IN_MEASUREMENT_DECREMENT();
@@ -2114,8 +2135,12 @@
int i;
scorep_mpi_request* orig_req;
uint64_t start_time_stamp;
+ uint64_t num_evts_written = SCOREP_get_number_of_events();
+ bool gen_region = !scorep_mpi_reduce_probe_test ||
+ (num_evts_written != scorep_mpi_last_probe_test_evt);

- if ( event_gen_active )
+
+ if ( event_gen_active && gen_region )
{
SCOREP_MPI_EVENT_GEN_OFF();

@@ -2127,6 +2152,8 @@
{
SCOREP_EnterWrapper( scorep_mpi_regions[ SCOREP_MPI_REGION__MPI_TESTALL ] );
}
+
+ scorep_mpi_last_probe_test_evt = SCOREP_get_number_of_events();
}

#if !defined( SCOREP_MPI_NO_HOOKS )
@@ -2177,8 +2204,9 @@
}
}
}
- if ( event_gen_active )
+ if ( event_gen_active && gen_region )
{
+ bool updateEvt = (scorep_mpi_last_probe_test_evt == SCOREP_get_number_of_events());
if ( event_gen_active_for_group )
{
SCOREP_ExitRegion( scorep_mpi_regions[ SCOREP_MPI_REGION__MPI_TESTALL ] );
@@ -2189,6 +2217,9 @@
}

SCOREP_MPI_EVENT_GEN_ON();
+ if(updateEvt) {
+ scorep_mpi_last_probe_test_evt = SCOREP_get_number_of_events();
+ }
}

SCOREP_IN_MEASUREMENT_DECREMENT();
@@ -2220,8 +2251,11 @@
int i;
scorep_mpi_request* orig_req;
uint64_t start_time_stamp;
+ uint64_t num_evts_written = SCOREP_get_number_of_events();
+ bool gen_region = !scorep_mpi_reduce_probe_test ||
+ (num_evts_written != scorep_mpi_last_probe_test_evt);

- if ( event_gen_active )
+ if ( event_gen_active && gen_region )
{
SCOREP_MPI_EVENT_GEN_OFF();

@@ -2233,6 +2267,7 @@
{
SCOREP_EnterWrapper( scorep_mpi_regions[ SCOREP_MPI_REGION__MPI_TESTSOME ] );
}
+ scorep_mpi_last_probe_test_evt = SCOREP_get_number_of_events();
}

#if !defined( SCOREP_MPI_NO_HOOKS )
@@ -2314,8 +2349,10 @@
}
}

- if ( event_gen_active )
+ if ( event_gen_active && gen_region )
{
+ bool updateEvt = (scorep_mpi_last_probe_test_evt ==
+ SCOREP_get_number_of_events());
if ( event_gen_active_for_group )
{
SCOREP_ExitRegion( scorep_mpi_regions[ SCOREP_MPI_REGION__MPI_TESTSOME ] );
@@ -2326,6 +2363,9 @@
}

SCOREP_MPI_EVENT_GEN_ON();
+ if(updateEvt) {
+ scorep_mpi_last_probe_test_evt = SCOREP_get_number_of_events();
+ }
}

SCOREP_IN_MEASUREMENT_DECREMENT();
diff -Naur scorep-5.0.original/src/adapters/mpi/scorep_mpi_init.c scorep-5.0/src/adapters/mpi/scorep_mpi_init.c
--- scorep-5.0.original/src/adapters/mpi/scorep_mpi_init.c 2019-03-22 10:41:49.131806652 -0700
+++ scorep-5.0/src/adapters/mpi/scorep_mpi_init.c 2019-04-03 01:59:13.540480000 -0700
@@ -308,6 +308,10 @@
mpi_subsystem_begin( void )
{
SCOREP_MPI_EVENT_GEN_ON();
+ char *reduce_logs = getenv("SCOREP_REDUCE_PROBE_TEST");
+ if(reduce_logs != NULL) {
+ scorep_mpi_reduce_probe_test = (atoi(reduce_logs) == 1);
+ }
return SCOREP_SUCCESS;
}

@@ -426,6 +430,8 @@
events are generated.
*/
bool scorep_mpi_generate_events = false;
+bool scorep_mpi_reduce_probe_test = false;
+uint64_t scorep_mpi_last_probe_test_evt = -1;

/**
* @internal
diff -Naur scorep-5.0.original/src/measurement/tracing/SCOREP_Tracing.c scorep-5.0/src/measurement/tracing/SCOREP_Tracing.c
--- scorep-5.0.original/src/measurement/tracing/SCOREP_Tracing.c 2019-03-22 10:41:49.391803294 -0700
+++ scorep-5.0/src/measurement/tracing/SCOREP_Tracing.c 2019-04-03 02:12:25.959430000 -0700
@@ -580,3 +580,11 @@
write_properties();
write_definitions();
}
+
+uint64_t SCOREP_get_number_of_events() { /* Returns the event count reported by the OTF2 event writer of the current CPU location; 0 when that writer is NULL. */
+ OTF2_EvtWriter* writer = scorep_tracing_get_trace_data( SCOREP_Location_GetCurrentCPULocation() )->otf_writer; /* NOTE(review): the trace-data pointer is dereferenced without a NULL check - confirm it is always valid here */
+ if(writer == NULL) return 0; /* no writer attached: report zero events */
+ uint64_t num_evts_written; /* filled in by the OTF2 call below; not initialized here */
+ OTF2_EvtWriter_GetNumberOfEvents( writer, &num_evts_written); /* NOTE(review): return status is ignored; if this call can fail, the value returned below is uninitialized - confirm */
+ return num_evts_written;
+}
diff -Naur scorep-5.0.original/src/measurement/tracing/SCOREP_Tracing.h scorep-5.0/src/measurement/tracing/SCOREP_Tracing.h
--- scorep-5.0.original/src/measurement/tracing/SCOREP_Tracing.h 2019-03-22 10:41:49.391803294 -0700
+++ scorep-5.0/src/measurement/tracing/SCOREP_Tracing.h 2019-04-03 02:11:30.732441000 -0700
@@ -86,5 +86,6 @@
void
SCOREP_Tracing_Write( void );

+uint64_t SCOREP_get_number_of_events();

#endif /* SCOREP_TRACING_H */
2 changes: 1 addition & 1 deletion tracer/Makefile
Original file line number Diff line number Diff line change
Expand Up @@ -24,7 +24,7 @@ all: traceR

PREFIX = ..

SRCS = p2p-events.C coll-events.C tracer-driver.C
SRCS = p2p-events.C coll-events.C tracer-driver.C qos-manager.C
OBJS = ${SRCS:.C=.o}

traceR: ${OBJS} components
Expand Down
1 change: 1 addition & 0 deletions tracer/Makefile.common
Original file line number Diff line number Diff line change
Expand Up @@ -30,6 +30,7 @@ SEQ_CXX = ${CXX}
# ROSS install directory
ROSS_DIR = ${HOME}/spack/opt/spack/linux-rhel7-x86_64/gcc-4.9.3/ross-7.0.0


# CODES install directory
CODES_DIR = ${HOME}/spack/opt/spack/linux-rhel7-x86_64/gcc-4.9.3/codes-1.0.0
CODES_LIBS = -L${CODES_DIR}/lib -Wl,-rpath,${CODES_DIR}/lib
Expand Down
10 changes: 10 additions & 0 deletions tracer/coll-events.C
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,7 @@

extern "C" {
#include "codes/model-net.h"
#include "codes/model-net-sched.h"
#include "codes/lp-io.h"
#include "codes/codes.h"
#include "codes/codes_mapping.h"
Expand All @@ -16,6 +17,9 @@ extern "C" {
}

#include "tracer-driver.h"
#include "qos-manager.h"

extern QoSManager qosManager;

// the indexing should match between the #define and the lookUpTable
#define TRACER_A2A 0
Expand Down Expand Up @@ -155,6 +159,10 @@ void enqueue_coll_msg(
m_local.proc_event_type = lookUpTable[index].local_event;
m_local.executed.taskid = ns->my_pe->currentCollTask;


int prio = qosManager.getServiceLevel(ns->my_job, lpid_to_pe(lp->id), dest);
model_net_set_msg_param(MN_MSG_PARAM_SCHED, MN_SCHED_PARAM_PRIO, (void*)&prio);

model_net_event(net_id, "coll", pe_to_lpid(dest, ns->my_job), size,
sendOffset + copyTime*(isEager?1:0), sizeof(proc_msg),
(const void*)&m_remote, sizeof(proc_msg), &m_local, lp);
Expand Down Expand Up @@ -241,6 +249,8 @@ void handle_coll_recv_post_event(
size = m->msgId.size;
m_remote.msgId.size = size;
}
int prio = qosManager.getServiceLevel(ns->my_job, lpid_to_pe(lp->id), m->msgId.pe);
model_net_set_msg_param(MN_MSG_PARAM_SCHED, MN_SCHED_PARAM_PRIO, (void*)&prio);
model_net_event(net_id, "coll", pe_to_lpid(m->msgId.pe, ns->my_job),
size, nic_delay, sizeof(proc_msg),
(const void*)&m_remote, sizeof(proc_msg), &m_local, lp);
Expand Down
11 changes: 11 additions & 0 deletions tracer/p2p-events.C
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,7 @@

extern "C" {
#include "codes/model-net.h"
#include "codes/model-net-sched.h"
#include "codes/lp-io.h"
#include "codes/codes.h"
#include "codes/codes_mapping.h"
Expand All @@ -16,6 +17,9 @@ extern "C" {
}

#include "tracer-driver.h"
#include "qos-manager.h"

extern QoSManager qosManager;

void handle_recv_event(
proc_state * ns,
Expand Down Expand Up @@ -841,6 +845,9 @@ int send_msg(
#endif
m_remote.iteration = iter;

int prio = qosManager.getServiceLevel(ns->my_job, lpid_to_pe(lp->id), lpid_to_pe(dest_id));
model_net_set_msg_param(MN_MSG_PARAM_SCHED, MN_SCHED_PARAM_PRIO, (void*)&prio);

/* model_net_event params:
int net_id, char* category, tw_lpid final_dest_lp,
uint64_t message_size, tw_stime offset, int remote_event_size,
Expand Down Expand Up @@ -877,6 +884,10 @@ void enqueue_msg(
#endif
m_remote.iteration = iter;

int prio = qosManager.getServiceLevel(ns->my_job, lpid_to_pe(lp->id), lpid_to_pe(dest_id));
model_net_set_msg_param(MN_MSG_PARAM_SCHED, MN_SCHED_PARAM_PRIO, (void*)&prio);


model_net_event(net_id, "p2p", dest_id, size, sendOffset,
sizeof(proc_msg), (const void*)&m_remote, sizeof(proc_msg), m_local,
lp);
Expand Down
Loading