-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathSwitchML.msg
107 lines (97 loc) · 2.19 KB
/
SwitchML.msg
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
// Two-field key made of a layer number and a job id.
// The C++ helpers in this file hash and equality-compare it on both fields,
// and order it by `layer` only.
struct TensorKey
{
    uint64_t layer;
    uint64_t job_id;
}
cplusplus(h) {{
// std::hash specialization so that TensorKey can serve as a key in
// hash-based standard containers.
namespace std {
template<>
struct hash<TensorKey>
{
    // Hashes both fields separately and folds the job_id hash into the
    // layer hash using the common "hash_combine" mixing expression
    // (seed ^= h + 0x9e3779b9 + (seed << 6) + (seed >> 2)).
    std::size_t operator()(const TensorKey &k) const
    {
        std::size_t seed = std::hash<uint64_t>()(k.layer);
        const std::size_t job_hash = std::hash<uint64_t>()(k.job_id);
        seed ^= job_hash + 0x9e3779b9 + (seed << 6) + (seed >> 2);
        return seed;
    }
};
} // namespace std

// Comparison operators for TensorKey; their definitions live in the
// cplusplus(cc) block of this file.
bool operator==(const TensorKey &lhs, const TensorKey &rhs);
bool operator<(const TensorKey &lhs, const TensorKey &rhs);
}}
cplusplus(cc) {{
// Two keys are equal only when both the layer and the job id match.
bool operator==(const TensorKey& lhs, const TensorKey& rhs)
{
    if (lhs.layer != rhs.layer)
        return false;
    return lhs.job_id == rhs.job_id;
}

// Reversed ordering on `layer` alone: lhs compares "less" when rhs has the
// smaller layer, which the original author describes as rhs having the
// higher priority.
// NOTE(review): job_id is not consulted, so keys with the same layer but
// different jobs are equivalent under this ordering even though operator==
// reports them as different. Confirm TensorKey is never used as the key of
// an ordered container (std::map / std::set), where such keys would collide.
bool operator<(const TensorKey& lhs, const TensorKey& rhs)
{
    return rhs.layer < lhs.layer;
}
}}
// Packet type codes for SwitchML traffic.
// The names suggest a connect handshake (CONN_REQ/CONN_ACK), a disconnect
// handshake (DISC_REQ/DISC_ACK) and a data packet type -- confirm against
// the modules that send and receive these values.
enum SwitchMLPacketType {
    SWITCHML_CONN_REQ = 0;
    SWITCHML_CONN_ACK = 1;
    SWITCHML_DISC_REQ = 2;
    SWITCHML_DISC_ACK = 3;
    SWITCHML_DATA     = 4;
};
// Network packet exchanged for SwitchML traffic.
// Field meanings below are inferred from names only where marked
// "presumably"; verify them against the sending/receiving modules.
packet SwitchMLPacket {
    // Every packet of this type starts with a length of 9000 bytes.
    byteLength = 9000;

    int from_id;                 // presumably the sender's id -- confirm
    uint32_t slot;
    uint32_t ver;
    uint32_t offset;
    int priority;
    TensorKey tensor_key;        // (layer, job_id) pair this packet refers to
    uint32_t n_workers;
    uint64_t num_pkts_expected;
    uint64_t grad_size;          // unit not visible here -- TODO confirm
    uint64_t iter;
    uint64_t chunk;
    bool upward;                 // presumably the travel direction -- confirm
}
// Message carrying two dynamically sized arrays (`path` and `modules`)
// together with the originator id and a GPU count.
// NOTE(review): whether `path` and `modules` are filled in lockstep is not
// visible in this file -- confirm with the code that handles this message.
message HierarchyQuery {
    int path[];          // dynamic array of ints
    int from_id;
    int num_gpus;
    cModule* modules[];  // dynamic array of module pointers
}
// Setup message: a job id, a dynamically sized list of ids, and a flag
// named `top_level`.
// NOTE(review): what the entries of `ids` identify (workers, switches, ...)
// is not shown in this file -- confirm with the sender.
message Setup {
    uint64_t job_id;
    int ids[];        // dynamic array of ints
    bool top_level;
};
// Request describing one collective operation.
// Explicit defaults: priority = 0, num_chunks = 1, chunk_id = 0; the other
// fields carry no initializer in this definition.
message CollectiveOperationRequest {
    int training_process_id;
    int priority = 0;
    int worker_id;
    bool completed;
    short model;
    uint64_t size;                    // unit not visible here -- TODO confirm
    uint64_t rank;
    TensorKey tensor_key;             // (layer, job_id) pair of the request
    uint64_t num_workers_allocated;
    uint64_t num_chunks = 1;
    uint64_t chunk_id = 0;
    simtime_t start;                  // presumably the start timestamp -- confirm
};
// Description of a job; every field has an explicit default.
// The three timestamps default to -1, presumably meaning "not set yet" --
// confirm with the code that reads them.
message Job {
    simtime_t submit_time = -1;
    simtime_t start_time = -1;
    simtime_t finish_time = -1;

    short model = 0;
    uint64_t job_id = 0;
    int worker_id = 0;
    uint64_t gpu = 8;
    uint64_t rank = 0;
    // NOTE(review): declared uint32_t here but uint64_t in
    // CollectiveOperationRequest -- confirm the width difference is intended.
    uint32_t num_workers_allocated = 2;
    uint64_t iters = 2;
};
// Message holding a layer number and an iteration number, both defaulting
// to 0.
message Instruction {
    uint64_t layer = 0;
    uint64_t iter = 0;
}