High CPU Usage per Chassis and Blade-radware-alteon-os

error
health-checks
alteon-os
radware
High CPU Usage per Chassis and Blade-radware-alteon-os
0

#1

High CPU Usage per Chassis and Blade-radware-alteon-os

Vendor: radware

OS: alteon-os

Description:
indeni will trigger an issue when CPU usage per chassis and blade is high.

Remediation Steps:
Determine the cause for the high CPU usage of the listed CPUs.

radware-SpStatsCpuUtilTable

#! META
# Monitoring script, declared to run at a 1 minute interval. The `requires` entries name the
# device tags (os.name = alteon-os, vendor = radware) this script is declared for.
name: radware-SpStatsCpuUtilTable
description: fetch the SP CPU utilization
type: monitoring
monitoring_interval: 1 minute 
requires:
    os.name: "alteon-os"
    vendor: "radware"

#! REMOTE::HTTP
# Fetches the SpStatsCpuUtilTable resource from the device over HTTPS.
url: /config/SpStatsCpuUtilTable
protocol: HTTPS

#! PARSER::JSON
# Emits one "cpu-usage" double metric per element matched by the JSONPath group below.
_metrics:
    -
        _groups:
            "$.['SpStatsCpuUtilTableE*!'][0:]":
                # SpIndex is captured as a temporary value; it is only used by the _transform below.
                _temp:
                    spIndex:
                        _value: SpIndex
                # The metric value is taken from the element's Util1Second field.
                _value.double:
                    _value: Util1Second
                _tags:
                    "im.name":
                        _constant: "cpu-usage"
                    "live-config":
                       _constant: "true"
                    "display-name":
                        _constant: "CPU Utilization"
                    "im.dstype.displayType":
                        _constant: "percentage"
                    # NOTE(review): the identity tag is declared as "cpu-id", but no "cpu-id" tag is set
                    # anywhere in this script; the _transform below emits a tag called "name" instead.
                    # Confirm whether the transform key should be "cpu-id".
                    "im.identity-tags":
                        _constant: "cpu-id"
        # Builds the "name" tag as the text "SP " followed by the captured spIndex value.
        _transform:
            _tags:
                name: |
                     {
                         print "SP " temp("spIndex")
                     }

high_per_chassis_blade_cpu_usage

package com.indeni.server.rules.library

import com.indeni.apidata.time.TimeSpan
import com.indeni.ruleengine.Scope.{Scope, ScopeValueHelper}
import com.indeni.ruleengine.expressions.Expression
import com.indeni.ruleengine.expressions.conditions.GreaterThanOrEqual
import com.indeni.ruleengine.expressions.core.{StatusTreeExpression, _}
import com.indeni.ruleengine.expressions.data.{SelectTagsExpression, SelectTimeSeriesExpression, TimeSeriesExpression}
import com.indeni.ruleengine.expressions.math.MinExpression
import com.indeni.ruleengine.expressions.scope.ScopableExpression
import com.indeni.server.common.ParameterValue
import com.indeni.server.common.data.conditions.{Equals, True}
import com.indeni.server.params.ParameterDefinition
import com.indeni.server.params.ParameterDefinition.UIType
import com.indeni.server.rules._
import com.indeni.server.rules.config.expressions.DynamicParameterExpression
import com.indeni.server.rules.library.core.PerDeviceRule
import com.indeni.server.sensor.models.managementprocess.alerts.dto.AlertSeverity


/**
  * Rule that reports CPUs whose usage stays high, listed per chassis and blade.
  *
  * Devices are selected by the `model` tag (it must equal `CheckPoint61k`). For each selected
  * device the rule iterates over every (`Blade`, `Chassis`, `cpu-id`) tag combination and looks at
  * the `cpu-usage` time series over the configured review window. A series matches when the
  * minimum computed by `MinExpression` over that window is greater than or equal to the
  * configured threshold. The "Number of CPUs" parameter is handed to the framework as
  * `minimumIssueCount` (presumably the number of matching CPUs required before the device-level
  * issue is raised -- confirm against `StatusTreeExpression.asCondition`).
  *
  * Originally created by amir on 03/02/2016.
  */
case class HighPerChassisBladeCpuUsageRule() extends PerDeviceRule {

  // ---------------------------------------------------------------------------------------------
  // User-configurable parameters
  // ---------------------------------------------------------------------------------------------

  private val highThresholdParameterName = "High_Threshold_of_CPU_Usage"

  // Usage threshold, default 70.0. The explicit cast to Object is kept exactly as written so that
  // the same ParameterValue constructor is selected.
  private val highThresholdParameter = new ParameterDefinition(
    highThresholdParameterName,
    "",
    "High Threshold of CPU Usage",
    "What is the threshold for the CPU usage for which once it is crossed an issue will be triggered. The CPU usage must be above this threshold constantly for a certain time frame in order for a issue to be triggered.",
    UIType.DOUBLE,
    new ParameterValue((70.0).asInstanceOf[Object])
  )

  private val numOfCpusParameterName = "higher_than_threshold_cpues"

  // How many CPUs must be over the threshold, default 1. Its description quotes the friendly name
  // of the threshold parameter, so this definition has to come after highThresholdParameter.
  private val numOfCpusParameter = new ParameterDefinition(
    numOfCpusParameterName,
    "",
    "Number of CPUs",
    s"""The number of CPUs with usage above the value set in "${highThresholdParameter.getFriendlyName}" before a issue is triggered.""",
    UIType.INTEGER,
    new ParameterValue((1).asInstanceOf[Object])
  )

  private val reviewedTimeframeParameterName: String = "reviewed_timeframe"

  // Length of the history window that is reviewed, default 10 minutes.
  private val reviewedTimeframeParameter = new ParameterDefinition(
    reviewedTimeframeParameterName,
    "",
    "Reviewed Timeframe",
    s"""The CPU usage must be above the value set in "${highThresholdParameter.getFriendlyName}" for this amount of time before a issue is triggered.""",
    UIType.TIMESPAN,
    TimeSpan.fromMinutes(10)
  )

  // ---------------------------------------------------------------------------------------------
  // Rule metadata
  // ---------------------------------------------------------------------------------------------

  override val metadata: RuleMetadata = RuleMetadata
    .builder(
      "high_per_chassis_blade_cpu_usage",
      "High CPU Usage per Chassis and Blade",
      "indeni will trigger an issue when CPU usage per chassis and blade is high.",
      AlertSeverity.ERROR
    )
    .configParameters(highThresholdParameter, numOfCpusParameter, reviewedTimeframeParameter)
    .build()

  /**
    * Builds the expression tree evaluated for this rule.
    *
    * The tree has three nested levels: devices filtered by model, then the
    * (`Blade`, `Chassis`, `cpu-id`) tag combinations, then the `cpu-usage` time series whose
    * window minimum is compared against the threshold.
    *
    * @param context supplies the `tsDao` and `metaDao` used by the select expressions
    * @return the root status tree carrying the headline, description and remediation steps
    */
  override def expressionTree(context: RuleContext): StatusTreeExpression = {

    // Condition on a single series: minimum of the reviewed samples >= configured threshold.
    val usageSeries = TimeSeriesExpression[Double]("cpu-usage")
    val windowMinimum = MinExpression(usageSeries)
    val thresholdExpr = DynamicParameterExpression
      .withConstantDefault(highThresholdParameter.getName, highThresholdParameter.getDefaultValue.asDouble.toDouble)
      .noneable
    val exceedsThreshold = GreaterThanOrEqual(windowMinimum, thresholdExpr)

    // Per-CPU description text. Values are read in the same order in which they appear in the
    // resulting message.
    val perCpuDescription = new ScopableExpression[String] {

      override protected def evalWithScope(time: Long, scope: Scope): String = {
        val observed = windowMinimum.eval(time).get.round
        val limit = thresholdExpr.eval(time).get
        val cpu = scope.getVisible("cpu-id").get
        val chassis = scope.getVisible("Chassis").get
        val blade = scope.getVisible("Blade").get
        s"Cpu usage ($observed%) above threshold ($limit%) of $cpu of chassis $chassis, blade $blade"
      }

      override def args: Set[Expression[_]] = Set(windowMinimum, thresholdExpr)
    }

    // Per-CPU headline text; it depends only on the scope tags, hence the empty argument set.
    val perCpuHeadline = new ScopableExpression[String] {

      override protected def evalWithScope(time: Long, scope: Scope): String = {
        val chassis = scope.getVisible("Chassis").get
        val blade = scope.getVisible("Blade").get
        val cpu = scope.getVisible("cpu-id").get
        s"chassis $chassis, blade $blade, cpu $cpu"
      }

      override def args: Set[Expression[_]] = Set()
    }

    // Innermost level: the cpu-usage series over the configured review window.
    val reviewWindow = DynamicParameterExpression.withConstantDefault(
      reviewedTimeframeParameter.getName,
      reviewedTimeframeParameter.getDefaultValue.asTimeSpan
    )
    val usageSeriesQuery = SelectTimeSeriesExpression[Double](context.tsDao, Set("cpu-usage"), reviewWindow)

    val perSeriesCondition = StatusTreeExpression(usageSeriesQuery, exceedsThreshold)
      .withSecondaryInfo(perCpuHeadline, perCpuDescription, title = "Blades with High CPU Usage")
      .asCondition()

    // Middle level: one iteration per (Blade, Chassis, cpu-id) tag combination.
    val cpuTagsQuery = SelectTagsExpression(context.tsDao, Set("Blade", "Chassis", "cpu-id"), True)
    val requiredFailingCpus = DynamicParameterExpression.withConstantDefault(
      numOfCpusParameter.getName,
      numOfCpusParameter.getDefaultValue.asInteger().toInt
    )

    val perCpuCondition = StatusTreeExpression(cpuTagsQuery, perSeriesCondition)
      .withoutInfo()
      .asCondition(minimumIssueCount = requiredFailingCpus)

    // Root-level texts.
    val rootHeadline = ConstantExpression("High CPU usage of specific blade CPUs")
    val rootDescription = ConstantExpression("Some CPUs are under high usage.")

    val nxosSteps =
      """1. Run the "show processes cpu" NX-OS command in order to show the CPU usage at the process level
        |2. Use the "show processes cpu history" command  to display the CPU usage for the last 60 seconds, 60 minutes, and 72 hours. Be sure to check the average CPU usage (#) and the spikes (*).
        |3. A quick way to determine the cause of high CPU usage is to use the "show system internal processes cpu" NX-OS command. This command was added in Release 6.2 and displays the CPU usage information for all threads that belong to a specific PID. Mostly likely, high CPU usage triggered by traffic would cause Netstack, as well as other features and processes such as Address Resolution Protocol (ARP) and Internet Group Management Protocol (IGMP), to run high.
        |4. Review the logs for relevant findings.
        |5. For more information please review: <a target="_blank" href="https://www.cisco.com/c/en/us/support/docs/interfaces-modules/nexus-7000-series-supervisor-1-module/116137-trouble-nexus7000-highcpu-00.html#anc5">High CPU Troubleshooting Guide</a>.""".stripMargin

    val juniperSteps =
      """|A Juniper SRX device may start dropping packets if CPU utilization reaches 100%. In order to determine the root cause of high CPU usage:
         |1. Check the CPU status in the routing engine by running "show chassis routing-engine" in the command line interface (CLI).
         |2. Identify the top running processes which hold most of CPU cycles by running the "show system processes extensive" command.
         |3. Consider restarting or ending processes if too many events are being handled (e.g. sampling, traceoptions, syslog, snmp).
         |4. Check CPU utilization in the forwarding engine by running "show chassis forwarding".  High CPU usage may indicate that the device is reaching the capacity limit.
         |5. Review the following article on Juniper tech support site: <a target="_blank" href="https://kb.juniper.net/InfoCenter/index?page=content&id=KB20989">[SRX/J-series] g_down process is going high; CPU utilization is going high; idle CPU down to 0%</a>.
         |6. If the problem persists, contact the Juniper Networks Technical Assistance Center (JTAC).""".stripMargin

    // Generic remediation text first, followed by the NX-OS and Juniper specific variants.
    val rootRemediation = ConditionalRemediationSteps(
      "Determine the cause for the high CPU usage of the listed CPUs.",
      ConditionalRemediationSteps.OS_NXOS -> nxosSteps,
      ConditionalRemediationSteps.VENDOR_JUNIPER -> juniperSteps
    )

    // Outermost level: only devices whose "model" tag equals "CheckPoint61k" are selected.
    val modelFilter = Equals("model", "CheckPoint61k")
    val deviceQuery = SelectTagsExpression(context.metaDao, Set(DeviceKey), modelFilter)

    StatusTreeExpression(deviceQuery, perCpuCondition)
      .withRootInfo(rootHeadline, rootDescription, rootRemediation)
  }
}