Skip to content

Commit

Permalink
Fix snmp agent not-responding issue when high CPU utilization
Browse files Browse the repository at this point in the history
  • Loading branch information
yejianquan committed Jan 2, 2025
1 parent 6a5c96d commit 97145ea
Show file tree
Hide file tree
Showing 4 changed files with 117 additions and 1 deletion.
5 changes: 5 additions & 0 deletions src/ax_interface/constants.py
Original file line number Diff line number Diff line change
Expand Up @@ -90,3 +90,8 @@ class PduTypes(int, Enum):


DEFAULT_PDU_TIMEOUT = 5

# Minimum ratio of "interval between MIBUpdater runs" to "time the last run
# took". util.get_next_update_interval() multiplies the measured execution
# time by this factor to derive the next update interval.
UPDATE_FREQUENCY_RATE = 10
# Upper bound, in seconds, applied to the execution-time-based interval
# computed by util.get_next_update_interval(). An updater's own static
# frequency is not capped by this value.
MAX_UPDATE_INTERVAL = 60
30 changes: 29 additions & 1 deletion src/ax_interface/mib.py
Original file line number Diff line number Diff line change
@@ -1,10 +1,12 @@
import asyncio
import bisect
import random
from datetime import datetime

from . import logger, util
from .constants import ValueType
from .encodings import ValueRepresentation
from .util import get_next_update_interval

"""
Update interval between update runs (in seconds).
Expand Down Expand Up @@ -32,6 +34,7 @@ async def start(self):
# Run the update while we are allowed
redis_exception_happen = False
while self.run_event.is_set():
start = datetime.now()
try:
# reinit internal structures
if self.update_counter > self.reinit_rate:
Expand All @@ -57,9 +60,34 @@ async def start(self):
# Any unexpected exception or error, log it and keep running
logger.exception("MIBUpdater.start() caught an unexpected exception during update_data()")

"""
On SONiC devices with a huge number of interfaces,
for example an RP card on an ethernet chassis (600+ interfaces, including backend ports),
the update_data function can be very slow, especially at 100% CPU utilization.
For example, ciscoSwitchQosMIB.QueueStatUpdater takes 1-3 seconds in the normal state
and 3-8 seconds at 100% CPU utilization.
We use asyncio/coroutines as the basic framework, so
the MIB updaters share the same asyncio event loop with the SNMP agent client.
Hence, while an updater is executing, the agent client can't receive or respond to new requests.
The combination of a high update frequency and a long execution time
causes SNMP requests to time out under high CPU utilization.
A fixed frequency (generally the default value of 5s)
doesn't work well in this huge-interface situation.
So when the execution time is long,
wait longer before the next update to give control of the asyncio event loop back to the SNMP agent.
"""

execution_time = (datetime.now() - start).total_seconds()
next_frequency = get_next_update_interval(execution_time, self.frequency)

if next_frequency > self.frequency:
logger.debug(f"MIBUpdater type[{type(self)}] slow update detected, "
f"update execution time[{execution_time}], next_frequency[{next_frequency}]")

# wait based on our update frequency before executing again.
# randomize to avoid concurrent update storms.
await asyncio.sleep(self.frequency + random.randint(-2, 2))
await asyncio.sleep(next_frequency + random.randint(-2, 2))

def reinit_data(self):
"""
Expand Down
29 changes: 29 additions & 0 deletions src/ax_interface/util.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,5 @@
import ipaddress
import math
import re

from ax_interface import constants
Expand Down Expand Up @@ -108,3 +109,31 @@ def ip2byte_tuple(ip):
"""
return tuple(i for i in ipaddress.ip_address(ip).packed)


def get_next_update_interval(execution_time, static_frequency):
    """
    Pick how long an updater should wait before its next run.

    >>> get_next_update_interval(0.4, 5)
    5
    >>> get_next_update_interval(0.87, 5)
    9
    >>> get_next_update_interval(18.88, 5)
    60

    The updaters run as coroutines, so while one update executes it blocks
    the SNMP proxy service and the other updaters. To keep that blocking
    share small, the wait is stretched so that
    'update interval' / 'update execution time' >= UPDATE_FREQUENCY_RATE.

    The execution-time-based interval is the execution time multiplied by
    UPDATE_FREQUENCY_RATE, rounded up to a whole number, and limited to
    MAX_UPDATE_INTERVAL. The configured static frequency acts as a floor:
    quick updates (and zero or negative execution times) simply get the
    static frequency back.

    :param execution_time: The execution time of the updater, in seconds
    :param static_frequency: Static frequency, generally the default value 5
    :return: the interval before the next update
    """
    rate = constants.UPDATE_FREQUENCY_RATE
    ceiling = constants.MAX_UPDATE_INTERVAL

    # Interval required to honour the interval/execution-time ratio.
    scaled_interval = math.ceil(execution_time * rate)

    # Never stretch the execution-time-based interval past the ceiling.
    if scaled_interval > ceiling:
        scaled_interval = ceiling

    # The static frequency wins unless the scaled interval is strictly larger.
    if scaled_interval > static_frequency:
        return scaled_interval
    return static_frequency
54 changes: 54 additions & 0 deletions tests/test_utils.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,54 @@
from unittest import TestCase

from ax_interface.util import get_next_update_interval


class TestUtil(TestCase):
    # Scenario: the update finishes almost instantly (0.000001s).
    # Expectation: the configured static interval (5/10/15s) is returned as is.
    def test_get_interval_quick_finish(self):
        for configured in [5, 10, 15]:
            interval = get_next_update_interval(0.000001, configured)
            self.assertEqual(interval, configured)

    # Scenario: a slow update (0.766666s) with a static interval of 5.
    # Expectation: the interval is stretched to ceil(0.766666 * 10) = 8.
    def test_get_interval_slow_finish(self):
        interval = get_next_update_interval(0.766666, 5)
        self.assertEqual(interval, 8)

    # Scenario: the same slow update (0.766666s), but the static interval is 10.
    # Expectation: the static interval already covers it, so 10 is returned.
    def test_get_interval_slow_finish_default_long(self):
        interval = get_next_update_interval(0.766666, 10)
        self.assertEqual(interval, 10)

    # Scenario: a very slow update (20.2324s) with a static interval of 10.
    # Expectation: the result is limited to the maximum interval, 60.
    def test_get_interval_very_slow(self):
        interval = get_next_update_interval(20.2324, 10)
        self.assertEqual(interval, 60)

    # Scenario: the execution time is the integer 0, static interval is 5.
    # Expectation: the static interval, 5, is returned.
    def test_get_interval_zero(self):
        interval = get_next_update_interval(0, 5)
        self.assertEqual(interval, 5)

    # Scenario: the execution time is the float 0.000000, static interval is 5.
    # Expectation: the static interval, 5, is returned.
    def test_get_interval_zero_long(self):
        interval = get_next_update_interval(0.000000, 5)
        self.assertEqual(interval, 5)

    # Scenario: a slightly negative execution time (-0.000001) is passed by
    # mistake, static interval is 5.
    # Expectation: the static interval, 5, is returned.
    def test_get_interval_negative(self):
        interval = get_next_update_interval(-0.000001, 5)
        self.assertEqual(interval, 5)

    # Scenario: a large negative execution time (-10.000001) is passed by
    # mistake, static interval is 5.
    # Expectation: the static interval, 5, is returned.
    def test_get_interval_negative_slow(self):
        interval = get_next_update_interval(-10.000001, 5)
        self.assertEqual(interval, 5)

0 comments on commit 97145ea

Please sign in to comment.