High CPU Usage per Chassis and Blade-checkpoint-secureplatform

High CPU Usage per Chassis and Blade-checkpoint-secureplatform
0

High CPU Usage per Chassis and Blade-checkpoint-secureplatform

Vendor: checkpoint

OS: secureplatform

Description:
Indeni will trigger an issue when the CPU usage on a chassis blade is high.

Remediation Steps:
Determine the cause for the high CPU usage of the listed CPUs.

How does this work?
CPU statistics are read from /proc/stat twice, 5 seconds apart, and the average CPU usage over that 5-second interval is calculated from the two samples.

Why is this important?
High CPU usage can cause traffic to be dropped and indicates a performance problem.

Without Indeni how would you find this?
An administrator could log in and manually check CPU usage. It is also visible in SmartView Monitor.

chkp-secureplatform-proc-stat

# Monitoring script definition: collects CPU usage data over SSH once per minute
# (see monitoring_interval) and hands the command output to an AWK parser.
name: chkp-secureplatform-proc-stat
description: displays CPU usage
type: monitoring
monitoring_interval: 1 minute
includes_resource_data: true
# Device tags that must match for this script to apply.
requires:
    vendor: checkpoint
    os.name: secureplatform
# Documentation for the "cpu-usage" entry (presumably the metric emitted by the
# parser below -- confirm against proc-stat.parser.1.awk).
comments:
    cpu-usage:
        why: |
            High CPU could cause traffic to be dropped, and would indicate a performance problem.
        how: |
            CPU statistics are taken from /proc/stat and a 5 second average is calculated from this.
        without-indeni: |
            An administrator could log in and manually check CPU usage. It is also visible in SmartView Monitor.
        can-with-snmp: true
        can-with-syslog: false
        vendor-provided-management: CPU usage can be checked with CLI commands "top"
            and "cpview". It can also be viewed in SmartView Monitor.
steps:
# Single step: the command prints /proc/stat, then the literal marker "END",
# sleeps 5 seconds and prints /proc/stat again, so the output holds two samples
# separated by "END". Both reads are run through ${nice-path} with "-n 15"
# (${nice-path} is presumably substituted by the platform with the path of the
# nice binary -- confirm). The output is parsed by the AWK file named under
# "parse".
-   run:
        type: SSH
        command: ${nice-path} -n 15 cat /proc/stat && echo "END" && sleep 5 && ${nice-path}
            -n 15 cat /proc/stat
    parse:
        type: AWK
        file: proc-stat.parser.1.awk

high_per_chassis_blade_cpu_usage

package com.indeni.server.rules.library.core
import com.indeni.apidata.time.TimeSpan
import com.indeni.ruleengine.Scope.{Scope, ScopeValueHelper}
import com.indeni.ruleengine.expressions.Expression
import com.indeni.ruleengine.expressions.conditions.GreaterThanOrEqual
import com.indeni.ruleengine.expressions.core.{StatusTreeExpression, _}
import com.indeni.ruleengine.expressions.data.{SelectTagsExpression, SelectTimeSeriesExpression, TimeSeriesExpression}
import com.indeni.ruleengine.expressions.math.MinExpression
import com.indeni.ruleengine.expressions.scope.ScopableExpression
import com.indeni.server.common.ParameterValue
import com.indeni.server.common.data.conditions.{Equals, True}
import com.indeni.server.params.ParameterDefinition
import com.indeni.server.params.ParameterDefinition.UIType
import com.indeni.server.rules._
import com.indeni.server.rules.config.expressions.DynamicParameterExpression
import com.indeni.server.rules.library.{ConditionalRemediationSteps, PerDeviceRule}
import com.indeni.server.sensor.models.managementprocess.alerts.dto.AlertSeverity


/**
  * Created by amir on 03/02/2016.
  *
  * Rule `high_per_chassis_blade_cpu_usage`: raises an issue about CPUs, identified by their
  * "Chassis", "Blade" and "cpu-id" tags, whose "cpu-usage" metric is at or above a configurable
  * threshold over a configurable time frame.
  *
  * Only devices whose "model" tag equals "CheckPoint61k" are examined.
  *
  * Configuration parameters exposed by the rule:
  *  - `High_Threshold_of_CPU_Usage` (double, default 70.0)
  *  - `higher_than_threshold_cpues` (integer, default 1)
  *  - `reviewed_timeframe` (time span, default 10 minutes)
  */
case class HighPerChassisBladeCpuUsageRule() extends PerDeviceRule {

  // --- Usage threshold ------------------------------------------------------------------------
  private val highThresholdParameterName = "High_Threshold_of_CPU_Usage"
  private val highThresholdParameter = new ParameterDefinition(
    highThresholdParameterName,
    "",
    "High Threshold of CPU Usage",
    "What is the threshold for the CPU usage for which once it is crossed an issue will be triggered. The CPU usage must be above this threshold constantly for a certain time frame in order for a issue to be triggered.",
    UIType.DOUBLE,
    // The cast boxes the primitive so it is passed as an Object.
    new ParameterValue((70.0).asInstanceOf[Object])
  )

  // --- Number of offending CPUs required ------------------------------------------------------
  private val numOfCpusParameterName = "higher_than_threshold_cpues"
  private val numOfCpusParameter = new ParameterDefinition(
    numOfCpusParameterName,
    "",
    "Number of CPUs",
    s"""The number of CPUs with usage above the value set in "${highThresholdParameter.getFriendlyName}" before a issue is triggered.""",
    UIType.INTEGER,
    new ParameterValue((1).asInstanceOf[Object])
  )

  // --- Length of the reviewed history ---------------------------------------------------------
  private val reviewedTimeframeParameterName: String = "reviewed_timeframe"

  private val reviewedTimeframeParameter = new ParameterDefinition(
    reviewedTimeframeParameterName,
    "",
    "Reviewed Timeframe",
    s"""The CPU usage must be above the value set in "${highThresholdParameter.getFriendlyName}" for this amount of time before a issue is triggered.""",
    UIType.TIMESPAN,
    TimeSpan.fromMinutes(10)
  )

  /** Rule identity, severity, categories and the three configuration parameters declared above. */
  override val metadata: RuleMetadata = RuleMetadata
    .builder(
      "high_per_chassis_blade_cpu_usage",
      "High CPU Usage per Chassis and Blade",
      "indeni will trigger an issue when CPU usage per chassis and blade is high.",
      AlertSeverity.ERROR,
      categories = Set(RuleCategory.HealthChecks),
      deviceCategory = DeviceCategory.AllDevices
    )
    .configParameters(highThresholdParameter, numOfCpusParameter, reviewedTimeframeParameter)
    .build()

  /**
    * Builds the status tree for this rule. From the outside in, it nests three levels:
    * matching devices, then Blade/Chassis/cpu-id tag combinations, then the "cpu-usage"
    * time series checked against the threshold.
    *
    * @param context supplies the time-series DAO (`tsDao`) and the metadata DAO (`metaDao`)
    * @return the root status tree expression carrying headline, description and remediation
    */
  override def expressionTree(context: RuleContext): StatusTreeExpression = {

    // Innermost condition: MinExpression over the "cpu-usage" series >= configured threshold.
    val cpuUsageSeries = TimeSeriesExpression[Double]("cpu-usage")
    val lowestUsage = MinExpression(cpuUsageSeries)
    val thresholdValue = DynamicParameterExpression
      .withConstantDefault(highThresholdParameter.getName, highThresholdParameter.getDefaultValue.asDouble.toDouble)
      .noneable
    val usageAtOrAboveThreshold = GreaterThanOrEqual(lowestUsage, thresholdValue)

    // Per-CPU description text; reads the usage, the threshold and the three scope tags.
    val perCpuDescription = new ScopableExpression[String] {

      override def args: Set[Expression[_]] = Set(lowestUsage, thresholdValue)

      override protected def evalWithScope(time: Long, scope: Scope): String = {
        val roundedUsage = lowestUsage.eval(time).get.round
        val threshold = thresholdValue.eval(time).get
        val cpuId = scope.getVisible("cpu-id").get
        val chassis = scope.getVisible("Chassis").get
        val blade = scope.getVisible("Blade").get
        s"Cpu usage ($roundedUsage%) above threshold ($threshold%) of $cpuId of chassis $chassis, blade $blade"
      }
    }

    // Per-CPU headline text; depends only on the scope tags, hence no expression arguments.
    val perCpuHeadline = new ScopableExpression[String] {

      override def args: Set[Expression[_]] = Set()

      override protected def evalWithScope(time: Long, scope: Scope): String = {
        val chassis = scope.getVisible("Chassis").get
        val blade = scope.getVisible("Blade").get
        val cpuId = scope.getVisible("cpu-id").get
        s"chassis $chassis, blade $blade, cpu $cpuId"
      }
    }

    // Time-series level: select "cpu-usage" over the reviewed time frame and test it.
    val reviewWindow = DynamicParameterExpression.withConstantDefault(
      reviewedTimeframeParameter.getName,
      reviewedTimeframeParameter.getDefaultValue.asTimeSpan
    )
    val usageSeriesQuery = SelectTimeSeriesExpression[Double](
      context.tsDao,
      Set("cpu-usage"),
      reviewWindow,
      denseOnly = true
    )
    val perCpuCondition = StatusTreeExpression(usageSeriesQuery, usageAtOrAboveThreshold)
      .withSecondaryInfo(perCpuHeadline, perCpuDescription, title = "Blades with High CPU Usage")
      .asCondition()

    // CPU level: iterate over tag combinations; the configured CPU count is passed as minimumIssueCount.
    val cpuTagsQuery = SelectTagsExpression(context.tsDao, Set("Blade", "Chassis", "cpu-id"), True)
    val requiredCpuCount = DynamicParameterExpression.withConstantDefault(
      numOfCpusParameter.getName,
      numOfCpusParameter.getDefaultValue.asInteger().toInt
    )
    val anyCpuCondition = StatusTreeExpression(cpuTagsQuery, perCpuCondition)
      .withoutInfo()
      .asCondition(minimumIssueCount = requiredCpuCount)

    // Root-level texts shown for the issue as a whole.
    val rootHeadline = ConstantExpression("High CPU usage of specific blade CPUs")
    val rootDescription = ConstantExpression("Some CPUs are under high usage.")
    val rootRemediation = ConditionalRemediationSteps("Determine the cause for the high CPU usage of the listed CPUs.",
      RemediationStepCondition.VENDOR_CISCO ->
        """1. Run the "show processes cpu" NX-OS command in order to show the CPU usage at the process level
          |2. Use the "show processes cpu history" command  to display the CPU usage for the last 60 seconds, 60 minutes, and 72 hours. Be sure to check the average CPU usage (#) and the spikes (*).
          |3. A quick way to determine the cause of high CPU usage is to use the "show system internal processes cpu" NX-OS command. This command was added in Release 6.2 and displays the CPU usage information for all threads that belong to a specific PID. Mostly likely, high CPU usage triggered by traffic would cause Netstack, as well as other features and processes such as Address Resolution Protocol (ARP) and Internet Group Management Protocol (IGMP), to run high.
          |4. Review the logs for relevant findings.
          |5. For more information please review: <a target="_blank" href="https://www.cisco.com/c/en/us/support/docs/interfaces-modules/nexus-7000-series-supervisor-1-module/116137-trouble-nexus7000-highcpu-00.html#anc5">High CPU Troubleshooting Guide</a>.""".stripMargin,
      RemediationStepCondition.VENDOR_JUNIPER ->
        """|A Juniper SRX device may start dropping packets if CPU utilization reaches 100%. In order to determine the root cause of high CPU usage:
           |1. Check the CPU status in the routing engine by running "show chassis routing-engine" in the command line interface (CLI).
           |2. Identify the top running processes which hold most of CPU cycles by running the "show system processes extensive" command.
           |3. Consider restarting or ending processes if too many events are being handled (e.g. sampling, traceoptions, syslog, snmp).
           |4. Check CPU utilization in the forwarding engine by running "show chassis forwarding".  High CPU usage may indicate that the device is reaching the capacity limit.
           |5. Review the following article on Juniper tech support site: <a target="_blank" href="https://kb.juniper.net/InfoCenter/index?page=content&id=KB20989">[SRX/J-series] g_down process is going high; CPU utilization is going high; idle CPU down to 0%</a>.
           |6. If the problem persists, contact the Juniper Networks Technical Assistance Center (JTAC).""".stripMargin
    )

    // Device level: restrict the rule to devices tagged with model "CheckPoint61k".
    val modelFilter = Equals("model", "CheckPoint61k")
    val matchingDevicesQuery = SelectTagsExpression(context.metaDao, Set(DeviceKey), modelFilter)

    StatusTreeExpression(matchingDevicesQuery, anyCpuCondition)
      .withRootInfo(rootHeadline, rootDescription, rootRemediation)
  }
}