Skip to content

Commit

Permalink
Support version skew between Antrea Agent and Flow Aggregator (#6912)
Browse files Browse the repository at this point in the history
When a new IPFIX Information Element (IE) is introduced, a version
mismatch between the Agent and the Flow Aggregator can be
problematic. A "new" Agent can send an IE which is unknown to the "old"
Flow Aggregator, or the "new" Flow Aggregator may expect an IE which is
not sent by an "old" Agent.

Prior to this change, we required the list of IEs sent by the Agent to
be the same as the list of IEs expected by the Flow Aggregator. This is
impossible to ensure during upgrade, as it may take a long time for all
Agents in the cluster to be upgraded.

After this change, Agents and Flow Aggregator can be upgraded in any
order (although we would recommend the Flow Aggregator to be upgraded
last). To achieve this, we introduce a new "process" between IPFIX
collection and aggregation in the Flow Aggregator: the
"preprocessor". The preprocessor is in charge of processing messages
received from the IPFIX collector, prior to handing records over to the
aggregation process. At the moment, its only task is to ensure that all
records have the expected fields. If a record has extra fields, they
will be discarded. If some fields are missing, they will be "appended"
to the record with a "zero" value. For example, we will use 0 for
integral types, "" for strings, 0.0.0.0 for IPv4 address, etc. Note that
we are able to keep the implementation simple by assuming that a record
either has missing fields or extra fields (not a combination of both),
and that such fields are always at the tail of the field list. This
assumption is based on implementation knowledge of the FlowExporter and
the FlowAggregator. When we introduce a new IE, it always comes after
all existing IEs, and we never deprecate / remove an existing IE across
versions.

Note that when the preprocessor adds a missing field, it is no longer
possible to determine whether the field was originally missing, or was
sent by the Agent with a zero value. This is why we recommend upgrading
the Flow Aggregator last (to avoid this situation altogether). However,
we do not believe that it is a significant drawback based on current
usage.

Fixes #6777

Signed-off-by: Antonin Bas <[email protected]>
  • Loading branch information
antoninbas authored Jan 10, 2025
1 parent afb9dfc commit 0efb397
Show file tree
Hide file tree
Showing 6 changed files with 374 additions and 15 deletions.
2 changes: 1 addition & 1 deletion pkg/flowaggregator/clickhouseclient/clickhouseclient.go
Original file line number Diff line number Diff line change
Expand Up @@ -90,7 +90,7 @@ const (
egressIP,
appProtocolName,
httpVals,
egressNodeName)
egressNodeName)
VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?,
?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?,
?, ?, ?, ?, ?)`
Expand Down
4 changes: 3 additions & 1 deletion pkg/flowaggregator/exporter/ipfix.go
Original file line number Diff line number Diff line change
Expand Up @@ -175,7 +175,9 @@ func (e *IPFIXExporter) sendRecord(record ipfixentities.Record, isRecordIPv6 boo
if err != nil {
return err
}
klog.V(4).InfoS("Data set sent successfully", "bytes sent", sentBytes)
if klog.V(7).Enabled() {
klog.InfoS("Data set sent successfully", "bytes sent", sentBytes)
}
return nil
}

Expand Down
84 changes: 76 additions & 8 deletions pkg/flowaggregator/flowaggregator.go
Original file line number Diff line number Diff line change
Expand Up @@ -104,6 +104,7 @@ type flowAggregator struct {
clusterUUID uuid.UUID
aggregatorTransportProtocol flowaggregatorconfig.AggregatorTransportProtocol
collectingProcess ipfix.IPFIXCollectingProcess
preprocessor *preprocessor
aggregationProcess ipfix.IPFIXAggregationProcess
activeFlowRecordTimeout time.Duration
inactiveFlowRecordTimeout time.Duration
Expand Down Expand Up @@ -175,13 +176,18 @@ func NewFlowAggregator(
APIServer: opt.Config.APIServer,
logTickerDuration: time.Minute,
}
err = fa.InitCollectingProcess()
if err != nil {
return nil, fmt.Errorf("error when creating collecting process: %v", err)
if err := fa.InitCollectingProcess(); err != nil {
return nil, fmt.Errorf("error when creating collecting process: %w", err)
}
err = fa.InitAggregationProcess()
if err != nil {
return nil, fmt.Errorf("error when creating aggregation process: %v", err)
// Use a buffered channel which ideally should be large enough to accommodate all the records
// included in a given IPFIX message. It would be unusual to have more than 128 records in
// an IPFIX message.
recordCh := make(chan ipfixentities.Record, 128)
if err := fa.InitPreprocessor(recordCh); err != nil {
return nil, fmt.Errorf("error when creating preprocessor: %w", err)
}
if err := fa.InitAggregationProcess(recordCh); err != nil {
return nil, fmt.Errorf("error when creating aggregation process: %w", err)
}
if opt.Config.ClickHouse.Enable {
var err error
Expand Down Expand Up @@ -261,15 +267,72 @@ func (fa *flowAggregator) InitCollectingProcess() error {
len(infoelements.AntreaFlowEndSecondsElementList) + len(infoelements.AntreaThroughputElementList) + len(infoelements.AntreaSourceThroughputElementList) + len(infoelements.AntreaDestinationThroughputElementList)
// clusterId
cpInput.NumExtraElements += 1
// Tell the collector to accept IEs which are not part of the IPFIX registry (hardcoded in
// the go-ipfix library). The preprocessor will take care of removing these elements.
cpInput.DecodingMode = collector.DecodingModeLenientKeepUnknown
var err error
fa.collectingProcess, err = collector.InitCollectingProcess(cpInput)
return err
}

func (fa *flowAggregator) InitAggregationProcess() error {
// InitPreprocessor builds the list of Information Elements (IEs) expected in IPv4 and IPv6 flow
// records (by looking each IE name up in the IPFIX registry), then creates the preprocessor which
// will read messages from the collecting process and write normalized records to recordCh.
// It must be called after InitCollectingProcess, as it uses fa.collectingProcess.GetMsgChan().
func (fa *flowAggregator) InitPreprocessor(recordCh chan<- ipfixentities.Record) error {
	// Helper to resolve a single IE by name and enterprise ID, wrapping lookup errors with
	// the IE name for easier troubleshooting.
	getInfoElementFromRegistry := func(ieName string, enterpriseID uint32) (*ipfixentities.InfoElement, error) {
		ie, err := fa.registry.GetInfoElement(ieName, enterpriseID)
		if err != nil {
			return nil, fmt.Errorf("error when looking up IE %q in registry: %w", ieName, err)
		}
		return ie, err
	}

	// Helper to assemble the full ordered IE list for one IP family: IANA IEs, then IANA
	// reverse IEs, then Antrea enterprise IEs. The order matters: the preprocessor relies on
	// records having (a prefix of) this exact element order.
	getInfoElements := func(isIPv4 bool) ([]*ipfixentities.InfoElement, error) {
		ianaInfoElements := infoelements.IANAInfoElementsIPv4
		// The reverse IE list is the same for both IP families.
		ianaReverseInfoElements := infoelements.IANAReverseInfoElements
		antreaInfoElements := infoelements.AntreaInfoElementsIPv4
		if !isIPv4 {
			ianaInfoElements = infoelements.IANAInfoElementsIPv6
			antreaInfoElements = infoelements.AntreaInfoElementsIPv6
		}
		infoElements := make([]*ipfixentities.InfoElement, 0)
		for _, ieName := range ianaInfoElements {
			ie, err := getInfoElementFromRegistry(ieName, ipfixregistry.IANAEnterpriseID)
			if err != nil {
				return nil, err
			}
			infoElements = append(infoElements, ie)
		}
		for _, ieName := range ianaReverseInfoElements {
			ie, err := getInfoElementFromRegistry(ieName, ipfixregistry.IANAReversedEnterpriseID)
			if err != nil {
				return nil, err
			}
			infoElements = append(infoElements, ie)
		}
		for _, ieName := range antreaInfoElements {
			ie, err := getInfoElementFromRegistry(ieName, ipfixregistry.AntreaEnterpriseID)
			if err != nil {
				return nil, err
			}
			infoElements = append(infoElements, ie)
		}
		return infoElements, nil
	}

	infoElementsIPv4, err := getInfoElements(true)
	if err != nil {
		return err
	}
	infoElementsIPv6, err := getInfoElements(false)
	if err != nil {
		return err
	}
	fa.preprocessor, err = newPreprocessor(infoElementsIPv4, infoElementsIPv6, fa.collectingProcess.GetMsgChan(), recordCh)
	return err
}

func (fa *flowAggregator) InitAggregationProcess(recordCh <-chan ipfixentities.Record) error {
var err error
apInput := ipfixintermediate.AggregationInput{
MessageChan: fa.collectingProcess.GetMsgChan(),
RecordChan: recordCh,
WorkerNum: aggregationWorkerNum,
CorrelateFields: correlateFields,
ActiveExpiryTimeout: fa.activeFlowRecordTimeout,
Expand All @@ -293,6 +356,11 @@ func (fa *flowAggregator) Run(stopCh <-chan struct{}) {
fa.collectingProcess.Start()
}()
ipfixProcessesWg.Add(1)
go func() {
defer ipfixProcessesWg.Done()
fa.preprocessor.Run(stopCh)
}()
ipfixProcessesWg.Add(1)
go func() {
// Same comment as above.
defer ipfixProcessesWg.Done()
Expand Down
11 changes: 6 additions & 5 deletions pkg/flowaggregator/flowaggregator_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -548,6 +548,7 @@ func TestFlowAggregator_Run(t *testing.T) {
activeFlowRecordTimeout: 1 * time.Hour,
logTickerDuration: 1 * time.Hour,
collectingProcess: mockCollectingProcess,
preprocessor: &preprocessor{},
aggregationProcess: mockAggregationProcess,
ipfixExporter: mockIPFIXExporter,
configWatcher: configWatcher,
Expand Down Expand Up @@ -858,12 +859,12 @@ func TestFlowAggregator_InitAggregationProcess(t *testing.T) {
activeFlowRecordTimeout: testActiveTimeout,
inactiveFlowRecordTimeout: testInactiveTimeout,
aggregatorTransportProtocol: flowaggregatorconfig.AggregatorTransportProtocolTCP,
registry: ipfix.NewIPFIXRegistry(),
}
err := fa.InitCollectingProcess()
require.NoError(t, err)

err = fa.InitAggregationProcess()
require.NoError(t, err)
require.NoError(t, fa.InitCollectingProcess())
recordCh := make(chan ipfixentities.Record)
require.NoError(t, fa.InitPreprocessor(recordCh))
require.NoError(t, fa.InitAggregationProcess(recordCh))
}

func TestFlowAggregator_fillK8sMetadata(t *testing.T) {
Expand Down
178 changes: 178 additions & 0 deletions pkg/flowaggregator/preprocessor.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,178 @@
// Copyright 2025 Antrea Authors
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

package flowaggregator

import (
"fmt"
"net"

"github.com/vmware/go-ipfix/pkg/entities"
"k8s.io/klog/v2"
)

// preprocessor is in charge of processing messages received from the IPFIX collector, prior to
// handing records over to the aggregation process. At the moment, its only task is to ensure that
// all records have the expected fields. If a record has extra fields, they will be discarded. If
// some fields are missing, they will be "appended" to the record with a "zero" value. For example,
// we will use 0 for integral types, "" for strings, 0.0.0.0 for IPv4 address, etc. Note that we are
// able to keep the implementation simple by assuming that a record either has missing fields or
// extra fields (not a combination of both), and that such fields are always at the tail of the
// field list. This assumption is based on implementation knowledge of the FlowExporter and the
// FlowAggregator.
type preprocessor struct {
	// inCh receives IPFIX messages from the collecting process.
	inCh <-chan *entities.Message
	// outCh is where normalized records are sent, for consumption by the aggregation process.
	outCh chan<- entities.Record

	// Number of Information Elements expected in an IPv4 (resp. IPv6) flow record.
	expectedElementsV4 int
	expectedElementsV6 int

	// Pre-built "zero"-valued elements, ordered to match the expected element lists, so that a
	// tail slice can be appended directly to a record which is missing fields.
	defaultElementsWithValueV4 []entities.InfoElementWithValue
	defaultElementsWithValueV6 []entities.InfoElementWithValue
}

// makeDefaultElementWithValue returns an InfoElementWithValue carrying the "zero" value for the
// data type of the provided Information Element: 0 for numeric types, "" for strings, false for
// booleans, 0.0.0.0 / :: for IP addresses, and an all-zero buffer for fixed-length binary types.
// An error is returned for unsupported data types.
func makeDefaultElementWithValue(ie *entities.InfoElement) (entities.InfoElementWithValue, error) {
	switch ie.DataType {
	case entities.OctetArray:
		// Use a zeroed buffer of the declared size for a fixed-length octet array; keep a
		// nil value when the length is variable.
		var buf []byte
		if ie.Len < entities.VariableLength {
			buf = make([]byte, ie.Len)
		}
		return entities.NewOctetArrayInfoElement(ie, buf), nil
	case entities.Unsigned8:
		return entities.NewUnsigned8InfoElement(ie, 0), nil
	case entities.Unsigned16:
		return entities.NewUnsigned16InfoElement(ie, 0), nil
	case entities.Unsigned32:
		return entities.NewUnsigned32InfoElement(ie, 0), nil
	case entities.Unsigned64:
		return entities.NewUnsigned64InfoElement(ie, 0), nil
	case entities.Signed8:
		return entities.NewSigned8InfoElement(ie, 0), nil
	case entities.Signed16:
		return entities.NewSigned16InfoElement(ie, 0), nil
	case entities.Signed32:
		return entities.NewSigned32InfoElement(ie, 0), nil
	case entities.Signed64:
		return entities.NewSigned64InfoElement(ie, 0), nil
	case entities.Float32:
		return entities.NewFloat32InfoElement(ie, 0), nil
	case entities.Float64:
		return entities.NewFloat64InfoElement(ie, 0), nil
	case entities.Boolean:
		return entities.NewBoolInfoElement(ie, false), nil
	case entities.DateTimeSeconds:
		return entities.NewDateTimeSecondsInfoElement(ie, 0), nil
	case entities.DateTimeMilliseconds:
		return entities.NewDateTimeMillisecondsInfoElement(ie, 0), nil
	case entities.MacAddress:
		// A MAC address is always 6 bytes long.
		return entities.NewMacAddressInfoElement(ie, make([]byte, 6)), nil
	case entities.Ipv4Address, entities.Ipv6Address:
		addr := net.IPv4zero
		if ie.DataType == entities.Ipv6Address {
			addr = net.IPv6zero
		}
		return entities.NewIPAddressInfoElement(ie, addr), nil
	case entities.String:
		return entities.NewStringInfoElement(ie, ""), nil
	default:
		return nil, fmt.Errorf("unexpected Information Element data type: %d", ie.DataType)
	}
}

// makeDefaultElementsWithValue builds a "zero"-valued InfoElementWithValue for every provided
// Information Element, preserving order. It fails on the first IE with an unsupported data type.
func makeDefaultElementsWithValue(infoElements []*entities.InfoElement) ([]entities.InfoElementWithValue, error) {
	elements := make([]entities.InfoElementWithValue, 0, len(infoElements))
	for _, ie := range infoElements {
		element, err := makeDefaultElementWithValue(ie)
		if err != nil {
			return nil, err
		}
		elements = append(elements, element)
	}
	return elements, nil
}

// newPreprocessor creates a preprocessor which reads IPFIX messages from inCh, normalizes their
// data records against the expected IPv4 / IPv6 element lists, and writes them to outCh. It
// pre-computes the "zero"-valued elements used to pad records with missing fields, and fails if
// any expected IE has a data type for which no default value can be generated.
func newPreprocessor(infoElementsV4, infoElementsV6 []*entities.InfoElement, inCh <-chan *entities.Message, outCh chan<- entities.Record) (*preprocessor, error) {
	p := &preprocessor{
		inCh:               inCh,
		outCh:              outCh,
		expectedElementsV4: len(infoElementsV4),
		expectedElementsV6: len(infoElementsV6),
	}
	var err error
	if p.defaultElementsWithValueV4, err = makeDefaultElementsWithValue(infoElementsV4); err != nil {
		return nil, fmt.Errorf("error when generating default values for IPv4 Information Elements expected from exporter: %w", err)
	}
	if p.defaultElementsWithValueV6, err = makeDefaultElementsWithValue(infoElementsV6); err != nil {
		return nil, fmt.Errorf("error when generating default values for IPv6 Information Elements expected from exporter: %w", err)
	}
	return p, nil
}

// Run processes incoming IPFIX messages until stopCh is closed or the input channel is closed.
// It is meant to be invoked in its own goroutine.
func (p *preprocessor) Run(stopCh <-chan struct{}) {
	for {
		var msg *entities.Message
		var ok bool
		select {
		case <-stopCh:
			return
		case msg, ok = <-p.inCh:
		}
		// A closed input channel means the collecting process has stopped.
		if !ok {
			return
		}
		p.processMsg(msg)
	}
}

// isRecordIPv4 reports whether the record describes an IPv4 flow, based on the presence of the
// sourceIPv4Address Information Element.
func isRecordIPv4(record entities.Record) bool {
	_, _, hasSourceIPv4 := record.GetInfoElementWithValue("sourceIPv4Address")
	return hasSourceIPv4
}

// processMsg normalizes all data records in the message and forwards them to the output channel.
// Non-data sets (e.g. template sets) are ignored.
func (p *preprocessor) processMsg(msg *entities.Message) {
	set := msg.GetSet()
	if set.GetSetType() != entities.Data {
		return
	}
	for _, record := range set.GetRecords() {
		p.processRecord(record)
	}
}

// processRecord forwards the record as-is when it has the expected number of elements, truncates
// extra tail elements, or pads missing tail elements with pre-built "zero" values.
func (p *preprocessor) processRecord(record entities.Record) {
	elementList := record.GetOrderedElementList()
	numElements := len(elementList)
	expectedElements := p.expectedElementsV4
	defaultElements := p.defaultElementsWithValueV4
	if !isRecordIPv4(record) {
		expectedElements = p.expectedElementsV6
		defaultElements = p.defaultElementsWithValueV6
	}
	switch {
	case numElements == expectedElements:
		p.outCh <- record
	case numElements > expectedElements:
		if klog.V(5).Enabled() {
			klog.InfoS("Record received from exporter includes unexpected elements, truncating", "expectedElements", expectedElements, "receivedElements", numElements)
		}
		// Creating a new Record seems like the best option here. By using
		// NewDataRecordFromElements, we should minimize the number of allocations
		// required.
		p.outCh <- entities.NewDataRecordFromElements(0, elementList[:expectedElements], true)
	default:
		if klog.V(5).Enabled() {
			klog.InfoS("Record received from exporter is missing information elements, adding fields with zero values", "expectedElements", expectedElements, "receivedElements", numElements)
		}
		elementList = append(elementList, defaultElements[numElements:]...)
		p.outCh <- entities.NewDataRecordFromElements(0, elementList, true)
	}
}
Loading

0 comments on commit 0efb397

Please sign in to comment.