diff --git a/sysfs/class_infiniband.go b/sysfs/class_infiniband.go index 7892c69a..0071ab1a 100644 --- a/sysfs/class_infiniband.go +++ b/sysfs/class_infiniband.go @@ -71,36 +71,129 @@ type InfiniBandCounters struct { // /sys/class/infiniband/<Name>/ports/<Port>/hw_counters // for a single port of one InfiniBand device. type InfiniBandHwCounters struct { + ActiveAhs *uint64 // hw_counters/active_ahs + ActiveCqs *uint64 // hw_counters/active_cqs + ActiveMrs *uint64 // hw_counters/active_mrs + ActiveMws *uint64 // hw_counters/active_mws + ActivePds *uint64 // hw_counters/active_pds + ActiveQps *uint64 // hw_counters/active_qps + ActiveRcQps *uint64 // hw_counters/active_rc_qps + ActiveSrqs *uint64 // hw_counters/active_srqs + ActiveUdQps *uint64 // hw_counters/active_ud_qps + BadRespErr *uint64 // hw_counters/bad_resp_err + DbFifoRegister *uint64 // hw_counters/db_fifo_register DuplicateRequest *uint64 // hw_counters/duplicate_request ImpliedNakSeqErr *uint64 // hw_counters/implied_nak_seq_err Lifespan *uint64 // hw_counters/lifespan LocalAckTimeoutErr *uint64 // hw_counters/local_ack_timeout_err + LocalProtectionErr *uint64 // hw_counters/local_protection_err + LocalQpOpErr *uint64 // hw_counters/local_qp_op_err + MaxRetryExceeded *uint64 // hw_counters/max_retry_exceeded + MemMgmtOpErr *uint64 // hw_counters/mem_mgmt_op_err + MissingResp *uint64 // hw_counters/missing_resp NpCnpSent *uint64 // hw_counters/np_cnp_sent NpEcnMarkedRocePackets *uint64 // hw_counters/np_ecn_marked_roce_packets + OosDropCount *uint64 // hw_counters/oos_drop_count OutOfBuffer *uint64 // hw_counters/out_of_buffer OutOfSequence *uint64 // hw_counters/out_of_sequence + PacingAlerts *uint64 // hw_counters/pacing_alerts + PacingComplete *uint64 // hw_counters/pacing_complete + PacingReschedule *uint64 // hw_counters/pacing_reschedule PacketSeqErr *uint64 // hw_counters/packet_seq_err + RecoverableErrors *uint64 // hw_counters/recoverable_errors + RemoteAccessErr *uint64 // hw_counters/remote_access_err + 
RemoteInvalidReqErr *uint64 // hw_counters/remote_invalid_req_err + RemoteOpErr *uint64 // hw_counters/remote_op_err ReqCqeError *uint64 // hw_counters/req_cqe_error ReqCqeFlushError *uint64 // hw_counters/req_cqe_flush_error ReqRemoteAccessErrors *uint64 // hw_counters/req_remote_access_errors ReqRemoteInvalidRequest *uint64 // hw_counters/req_remote_invalid_request + ResCmpErr *uint64 // hw_counters/res_cmp_err + ResCqLoadErr *uint64 // hw_counters/res_cq_load_err + ResExceedMax *uint64 // hw_counters/res_exceed_max + ResExceedsWqe *uint64 // hw_counters/res_exceeds_wqe + ResInvalidDupRkey *uint64 // hw_counters/res_invalid_dup_rkey + ResIrrqOflow *uint64 // hw_counters/res_irrq_oflow + ResizeCqCnt *uint64 // hw_counters/resize_cq_cnt + ResLengthMismatch *uint64 // hw_counters/res_length_mismatch + ResMemErr *uint64 // hw_counters/res_mem_err + ResOpcodeErr *uint64 // hw_counters/res_opcode_err RespCqeError *uint64 // hw_counters/resp_cqe_error RespCqeFlushError *uint64 // hw_counters/resp_cqe_flush_error RespLocalLengthError *uint64 // hw_counters/resp_local_length_error RespRemoteAccessErrors *uint64 // hw_counters/resp_remote_access_errors + ResRemInvErr *uint64 // hw_counters/res_rem_inv_err + ResRxDomainErr *uint64 // hw_counters/res_rx_domain_err + ResRxInvalidRkey *uint64 // hw_counters/res_rx_invalid_rkey + ResRxNoPerm *uint64 // hw_counters/res_rx_no_perm + ResRxPciErr *uint64 // hw_counters/res_rx_pci_err + ResRxRangeErr *uint64 // hw_counters/res_rx_range_err + ResSrqErr *uint64 // hw_counters/res_srq_err + ResSrqLoadErr *uint64 // hw_counters/res_srq_load_err + ResTxDomainErr *uint64 // hw_counters/res_tx_domain_err + ResTxInvalidRkey *uint64 // hw_counters/res_tx_invalid_rkey + ResTxNoPerm *uint64 // hw_counters/res_tx_no_perm + ResTxPciErr *uint64 // hw_counters/res_tx_pci_err + ResTxRangeErr *uint64 // hw_counters/res_tx_range_err + ResUnalignedAtomic *uint64 // hw_counters/res_unaligned_atomic + ResUnsupOpcode *uint64 // 
hw_counters/res_unsup_opcode + ResWqeFormatErr *uint64 // hw_counters/res_wqe_format_err RnrNakRetryErr *uint64 // hw_counters/rnr_nak_retry_err - RoceAdpRetrans *uint64 // hw_counters/roce_adp_retrans + RnrNaksRcvd *uint64 // hw_counters/rnr_naks_rcvd RoceAdpRetransTo *uint64 // hw_counters/roce_adp_retrans_to - RoceSlowRestart *uint64 // hw_counters/roce_slow_restart + RoceAdpRetrans *uint64 // hw_counters/roce_adp_retrans RoceSlowRestartCnps *uint64 // hw_counters/roce_slow_restart_cnps RoceSlowRestartTrans *uint64 // hw_counters/roce_slow_restart_trans + RoceSlowRestart *uint64 // hw_counters/roce_slow_restart RpCnpHandled *uint64 // hw_counters/rp_cnp_handled RpCnpIgnored *uint64 // hw_counters/rp_cnp_ignored RxAtomicRequests *uint64 // hw_counters/rx_atomic_requests + RxAtomicReq *uint64 // hw_counters/rx_atomic_req + RxBytes *uint64 // hw_counters/rx_bytes + RxCnpPkts *uint64 // hw_counters/rx_cnp_pkts RxDctConnect *uint64 // hw_counters/rx_dct_connect + RxEcnMarkedPkts *uint64 // hw_counters/rx_ecn_marked_pkts + RxGoodBytes *uint64 // hw_counters/rx_good_bytes + RxGoodPkts *uint64 // hw_counters/rx_good_pkts RxIcrcEncapsulated *uint64 // hw_counters/rx_icrc_encapsulated + RxOutOfBuffer *uint64 // hw_counters/rx_out_of_buffer + RxPkts *uint64 // hw_counters/rx_pkts RxReadRequests *uint64 // hw_counters/rx_read_requests + RxReadReq *uint64 // hw_counters/rx_read_req + RxReadResp *uint64 // hw_counters/rx_read_resp + RxRoceDiscards *uint64 // hw_counters/rx_roce_discards + RxRoceErrors *uint64 // hw_counters/rx_roce_errors + RxRoceGoodBytes *uint64 // hw_counters/rx_roce_good_bytes + RxRoceGoodPkts *uint64 // hw_counters/rx_roce_good_pkts + RxRoceOnlyBytes *uint64 // hw_counters/rx_roce_only_bytes + RxRoceOnlyPkts *uint64 // hw_counters/rx_roce_only_pkts + RxSendReq *uint64 // hw_counters/rx_send_req RxWriteRequests *uint64 // hw_counters/rx_write_requests + RxWriteReq *uint64 // hw_counters/rx_write_req + SeqErrNaksRcvd *uint64 // 
hw_counters/seq_err_naks_rcvd + ToRetransmits *uint64 // hw_counters/to_retransmits + TxAtomicReq *uint64 // hw_counters/tx_atomic_req + TxBytes *uint64 // hw_counters/tx_bytes + TxCnpPkts *uint64 // hw_counters/tx_cnp_pkts + TxPkts *uint64 // hw_counters/tx_pkts + TxReadReq *uint64 // hw_counters/tx_read_req + TxReadResp *uint64 // hw_counters/tx_read_resp + TxRoceDiscards *uint64 // hw_counters/tx_roce_discards + TxRoceErrors *uint64 // hw_counters/tx_roce_errors + TxRoceOnlyBytes *uint64 // hw_counters/tx_roce_only_bytes + TxRoceOnlyPkts *uint64 // hw_counters/tx_roce_only_pkts + TxSendReq *uint64 // hw_counters/tx_send_req + TxWriteReq *uint64 // hw_counters/tx_write_req + UnrecoverableErr *uint64 // hw_counters/unrecoverable_err + WatermarkAhs *uint64 // hw_counters/watermark_ahs + WatermarkCqs *uint64 // hw_counters/watermark_cqs + WatermarkMrs *uint64 // hw_counters/watermark_mrs + WatermarkMws *uint64 // hw_counters/watermark_mws + WatermarkPds *uint64 // hw_counters/watermark_pds + WatermarkQps *uint64 // hw_counters/watermark_qps + WatermarkRcQps *uint64 // hw_counters/watermark_rc_qps + WatermarkSrqs *uint64 // hw_counters/watermark_srqs + WatermarkUdQps *uint64 // hw_counters/watermark_ud_qps } // InfiniBandPort contains info from files in @@ -280,7 +373,9 @@ func (fs FS) parseInfiniBandPort(name string, port string) (*InfiniBandPort, err } // Intel irdma module does not expose /sys/class/infiniband/<Name>/ports/<Port>/counters - if !strings.HasPrefix(ibp.Name, "irdma") { + // Broadcom bnxt_re module exposes /sys/class/infiniband/<Name>/ports/<Port>/counters but + // returns "Invalid argument" for all of them, so they are skipped as well. + if !strings.HasPrefix(ibp.Name, "irdma") && !strings.HasPrefix(ibp.Name, "bnxt_re") { counters, err := parseInfiniBandCounters(portPath) if err != nil { return nil, err @@ -288,7 +383,7 @@ func (fs FS) parseInfiniBandPort(name string, port string) (*InfiniBandPort, err ibp.Counters = *counters } - if strings.HasPrefix(ibp.Name, "irdma") || strings.HasPrefix(ibp.Name, "mlx5_") { 
+ if strings.HasPrefix(ibp.Name, "irdma") || strings.HasPrefix(ibp.Name, "mlx5_") || strings.HasPrefix(ibp.Name, "bnxt_re") { hwCounters, err := parseInfiniBandHwCounters(portPath) if err != nil { return nil, err @@ -491,24 +586,74 @@ func parseInfiniBandHwCounters(portPath string) (*InfiniBandHwCounters, error) { vp := util.NewValueParser(value) switch f.Name() { + case "active_ahs": + hwCounters.ActiveAhs = vp.PUInt64() + case "active_cqs": + hwCounters.ActiveCqs = vp.PUInt64() + case "active_mrs": + hwCounters.ActiveMrs = vp.PUInt64() + case "active_mws": + hwCounters.ActiveMws = vp.PUInt64() + case "active_pds": + hwCounters.ActivePds = vp.PUInt64() + case "active_qps": + hwCounters.ActiveQps = vp.PUInt64() + case "active_rc_qps": + hwCounters.ActiveRcQps = vp.PUInt64() + case "active_srqs": + hwCounters.ActiveSrqs = vp.PUInt64() + case "active_ud_qps": + hwCounters.ActiveUdQps = vp.PUInt64() + case "bad_resp_err": + hwCounters.BadRespErr = vp.PUInt64() + case "db_fifo_register": + hwCounters.DbFifoRegister = vp.PUInt64() case "duplicate_request": hwCounters.DuplicateRequest = vp.PUInt64() + case "dup_req": // Broadcom bnxt_re names this counter dup_req; assumed equivalent to duplicate_request (unconfirmed). 
+ hwCounters.DuplicateRequest = vp.PUInt64() case "implied_nak_seq_err": hwCounters.ImpliedNakSeqErr = vp.PUInt64() case "lifespan": hwCounters.Lifespan = vp.PUInt64() case "local_ack_timeout_err": hwCounters.LocalAckTimeoutErr = vp.PUInt64() + case "local_protection_err": + hwCounters.LocalProtectionErr = vp.PUInt64() + case "local_qp_op_err": + hwCounters.LocalQpOpErr = vp.PUInt64() + case "max_retry_exceeded": + hwCounters.MaxRetryExceeded = vp.PUInt64() + case "mem_mgmt_op_err": + hwCounters.MemMgmtOpErr = vp.PUInt64() + case "missing_resp": + hwCounters.MissingResp = vp.PUInt64() case "np_cnp_sent": hwCounters.NpCnpSent = vp.PUInt64() case "np_ecn_marked_roce_packets": hwCounters.NpEcnMarkedRocePackets = vp.PUInt64() + case "oos_drop_count": + hwCounters.OosDropCount = vp.PUInt64() case "out_of_buffer": hwCounters.OutOfBuffer = vp.PUInt64() case "out_of_sequence": hwCounters.OutOfSequence = vp.PUInt64() + case "pacing_alerts": + hwCounters.PacingAlerts = vp.PUInt64() + case "pacing_complete": + hwCounters.PacingComplete = vp.PUInt64() + case "pacing_reschedule": + hwCounters.PacingReschedule = vp.PUInt64() case "packet_seq_err": hwCounters.PacketSeqErr = vp.PUInt64() + case "recoverable_errors": + hwCounters.RecoverableErrors = vp.PUInt64() + case "remote_access_err": + hwCounters.RemoteAccessErr = vp.PUInt64() + case "remote_invalid_req_err": + hwCounters.RemoteInvalidReqErr = vp.PUInt64() + case "remote_op_err": + hwCounters.RemoteOpErr = vp.PUInt64() case "req_cqe_error": hwCounters.ReqCqeError = vp.PUInt64() case "req_cqe_flush_error": @@ -517,6 +662,26 @@ func parseInfiniBandHwCounters(portPath string) (*InfiniBandHwCounters, error) { hwCounters.ReqRemoteAccessErrors = vp.PUInt64() case "req_remote_invalid_request": hwCounters.ReqRemoteInvalidRequest = vp.PUInt64() + case "res_cmp_err": + hwCounters.ResCmpErr = vp.PUInt64() + case "res_cq_load_err": + hwCounters.ResCqLoadErr = vp.PUInt64() + case "res_exceed_max": + hwCounters.ResExceedMax = vp.PUInt64() 
+ case "res_exceeds_wqe": + hwCounters.ResExceedsWqe = vp.PUInt64() + case "res_invalid_dup_rkey": + hwCounters.ResInvalidDupRkey = vp.PUInt64() + case "res_irrq_oflow": + hwCounters.ResIrrqOflow = vp.PUInt64() + case "resize_cq_cnt": + hwCounters.ResizeCqCnt = vp.PUInt64() + case "res_length_mismatch": + hwCounters.ResLengthMismatch = vp.PUInt64() + case "res_mem_err": + hwCounters.ResMemErr = vp.PUInt64() + case "res_opcode_err": + hwCounters.ResOpcodeErr = vp.PUInt64() case "resp_cqe_error": hwCounters.RespCqeError = vp.PUInt64() case "resp_cqe_flush_error": @@ -525,32 +690,150 @@ func parseInfiniBandHwCounters(portPath string) (*InfiniBandHwCounters, error) { hwCounters.RespLocalLengthError = vp.PUInt64() case "resp_remote_access_errors": hwCounters.RespRemoteAccessErrors = vp.PUInt64() + case "res_rem_inv_err": + hwCounters.ResRemInvErr = vp.PUInt64() + case "res_rx_domain_err": + hwCounters.ResRxDomainErr = vp.PUInt64() + case "res_rx_invalid_rkey": + hwCounters.ResRxInvalidRkey = vp.PUInt64() + case "res_rx_no_perm": + hwCounters.ResRxNoPerm = vp.PUInt64() + case "res_rx_pci_err": + hwCounters.ResRxPciErr = vp.PUInt64() + case "res_rx_range_err": + hwCounters.ResRxRangeErr = vp.PUInt64() + case "res_srq_err": + hwCounters.ResSrqErr = vp.PUInt64() + case "res_srq_load_err": + hwCounters.ResSrqLoadErr = vp.PUInt64() + case "res_tx_domain_err": + hwCounters.ResTxDomainErr = vp.PUInt64() + case "res_tx_invalid_rkey": + hwCounters.ResTxInvalidRkey = vp.PUInt64() + case "res_tx_no_perm": + hwCounters.ResTxNoPerm = vp.PUInt64() + case "res_tx_pci_err": + hwCounters.ResTxPciErr = vp.PUInt64() + case "res_tx_range_err": + hwCounters.ResTxRangeErr = vp.PUInt64() + case "res_unaligned_atomic": + hwCounters.ResUnalignedAtomic = vp.PUInt64() + case "res_unsup_opcode": + hwCounters.ResUnsupOpcode = vp.PUInt64() + case "res_wqe_format_err": + hwCounters.ResWqeFormatErr = vp.PUInt64() case "rnr_nak_retry_err": hwCounters.RnrNakRetryErr = vp.PUInt64() - case 
"roce_adp_retrans": - hwCounters.RoceAdpRetrans = vp.PUInt64() + case "rnr_naks_rcvd": + hwCounters.RnrNaksRcvd = vp.PUInt64() case "roce_adp_retrans_to": hwCounters.RoceAdpRetransTo = vp.PUInt64() - case "roce_slow_restart": - hwCounters.RoceSlowRestart = vp.PUInt64() + case "roce_adp_retrans": + hwCounters.RoceAdpRetrans = vp.PUInt64() case "roce_slow_restart_cnps": hwCounters.RoceSlowRestartCnps = vp.PUInt64() case "roce_slow_restart_trans": hwCounters.RoceSlowRestartTrans = vp.PUInt64() + case "roce_slow_restart": + hwCounters.RoceSlowRestart = vp.PUInt64() case "rp_cnp_handled": hwCounters.RpCnpHandled = vp.PUInt64() case "rp_cnp_ignored": hwCounters.RpCnpIgnored = vp.PUInt64() case "rx_atomic_requests": hwCounters.RxAtomicRequests = vp.PUInt64() + case "rx_atomic_req": + hwCounters.RxAtomicReq = vp.PUInt64() + case "rx_bytes": + hwCounters.RxBytes = vp.PUInt64() + case "rx_cnp_pkts": + hwCounters.RxCnpPkts = vp.PUInt64() case "rx_dct_connect": hwCounters.RxDctConnect = vp.PUInt64() + case "rx_ecn_marked_pkts": + hwCounters.RxEcnMarkedPkts = vp.PUInt64() + case "rx_good_bytes": + hwCounters.RxGoodBytes = vp.PUInt64() + case "rx_good_pkts": + hwCounters.RxGoodPkts = vp.PUInt64() case "rx_icrc_encapsulated": hwCounters.RxIcrcEncapsulated = vp.PUInt64() + case "rx_out_of_buffer": + hwCounters.RxOutOfBuffer = vp.PUInt64() + case "rx_pkts": + hwCounters.RxPkts = vp.PUInt64() case "rx_read_requests": hwCounters.RxReadRequests = vp.PUInt64() + case "rx_read_req": + hwCounters.RxReadReq = vp.PUInt64() + case "rx_read_resp": + hwCounters.RxReadResp = vp.PUInt64() + case "rx_roce_discards": + hwCounters.RxRoceDiscards = vp.PUInt64() + case "rx_roce_errors": + hwCounters.RxRoceErrors = vp.PUInt64() + case "rx_roce_good_bytes": + hwCounters.RxRoceGoodBytes = vp.PUInt64() + case "rx_roce_good_pkts": + hwCounters.RxRoceGoodPkts = vp.PUInt64() + case "rx_roce_only_bytes": + hwCounters.RxRoceOnlyBytes = vp.PUInt64() + case "rx_roce_only_pkts": + hwCounters.RxRoceOnlyPkts = 
vp.PUInt64() + case "rx_send_req": + hwCounters.RxSendReq = vp.PUInt64() case "rx_write_requests": hwCounters.RxWriteRequests = vp.PUInt64() + case "rx_write_req": + hwCounters.RxWriteReq = vp.PUInt64() + case "seq_err_naks_rcvd": + hwCounters.SeqErrNaksRcvd = vp.PUInt64() + case "to_retransmits": + hwCounters.ToRetransmits = vp.PUInt64() + case "tx_atomic_req": + hwCounters.TxAtomicReq = vp.PUInt64() + case "tx_bytes": + hwCounters.TxBytes = vp.PUInt64() + case "tx_cnp_pkts": + hwCounters.TxCnpPkts = vp.PUInt64() + case "tx_pkts": + hwCounters.TxPkts = vp.PUInt64() + case "tx_read_req": + hwCounters.TxReadReq = vp.PUInt64() + case "tx_read_resp": + hwCounters.TxReadResp = vp.PUInt64() + case "tx_roce_discards": + hwCounters.TxRoceDiscards = vp.PUInt64() + case "tx_roce_errors": + hwCounters.TxRoceErrors = vp.PUInt64() + case "tx_roce_only_bytes": + hwCounters.TxRoceOnlyBytes = vp.PUInt64() + case "tx_roce_only_pkts": + hwCounters.TxRoceOnlyPkts = vp.PUInt64() + case "tx_send_req": + hwCounters.TxSendReq = vp.PUInt64() + case "tx_write_req": + hwCounters.TxWriteReq = vp.PUInt64() + case "unrecoverable_err": + hwCounters.UnrecoverableErr = vp.PUInt64() + case "watermark_ahs": + hwCounters.WatermarkAhs = vp.PUInt64() + case "watermark_cqs": + hwCounters.WatermarkCqs = vp.PUInt64() + case "watermark_mrs": + hwCounters.WatermarkMrs = vp.PUInt64() + case "watermark_mws": + hwCounters.WatermarkMws = vp.PUInt64() + case "watermark_pds": + hwCounters.WatermarkPds = vp.PUInt64() + case "watermark_qps": + hwCounters.WatermarkQps = vp.PUInt64() + case "watermark_rc_qps": + hwCounters.WatermarkRcQps = vp.PUInt64() + case "watermark_srqs": + hwCounters.WatermarkSrqs = vp.PUInt64() + case "watermark_ud_qps": + hwCounters.WatermarkUdQps = vp.PUInt64() } if err := vp.Err(); err != nil {