High load average-checkpoint-secureplatform

High load average-checkpoint-secureplatform

Vendor: checkpoint

OS: secureplatform

Description:
indeni will trigger an issue when the load average on a given device seems high.

Remediation Steps:
Review the current activity on the device to determine if there is a specific cause for this.

chkp-secureplatform-load_averages

name: chkp-secureplatform-load_averages
description: displays the load average of the system
type: monitoring
monitoring_interval: 10 minute
requires:
    vendor: checkpoint
    os.name: secureplatform
comments:
    load-average-one-minute:
        why: |
            To check the normalized load average time for the 1st minute
        how: |
            By checking the first value displayed in the linux command "uptime"
        can-with-snmp: false
        can-with-syslog: false

    load-average-five-minutes:
        why: |
            To check the load average time for the 5th minute
        how: |
            By checking the second value displayed in the linux command "uptime"
        can-with-snmp: false
        can-with-syslog: false

    load-average-fifteen-minutes:
        why: |
            To check the normalized load average time for the 15th minute
        how: |
            By checking the third value displayed in the linux command "uptime"
        can-with-snmp: false
        can-with-syslog: false

    load-average-fifteen-minutes-live-config:
        why: |
            To check the normalized load average time for the 15th minute
        how: |
            By checking the third value displayed in the linux command "uptime"
        can-with-snmp: false
        can-with-syslog: false

    load-average-one-minute-live-config:
        why: |
            To check the load average time for the 1st minute
        how: |
            By checking the first value displayed in the linux command "uptime"
        can-with-snmp: false
        can-with-syslog: false

    load-average-five-minutes-live-config:
        why: |
            To check the load average time for the 5th minute
        how: |
            By checking the second value displayed in the linux command "uptime"
        can-with-snmp: false
        can-with-syslog: false
# Collection steps: one SSH command whose output is handed to an AWK parser.
steps:
   -  run:
          type: SSH
          # Runs "uptime", then counts the lines matching 'processor' in
          # /proc/cpuinfo (egrep piped into "wc -l"). Both uptime and egrep are
          # launched through ${nice-path} with "-n 15"; ${nice-path} presumably
          # resolves to the "nice" binary on the device (TODO confirm).
          command: ${nice-path} -n 15 uptime ; ${nice-path} -n 15 egrep -e 'processor'
              /proc/cpuinfo | wc -l
      parse:
          type: AWK
          # The parser script is referenced by name only; its contents are not
          # part of this file.
          file: load-average.parser.1.awk

linux_high_load_average

package com.indeni.server.rules.library.core

import com.indeni.apidata.time.TimeSpan
import com.indeni.ruleengine.expressions.OptionalExpression
import com.indeni.ruleengine.expressions.conditions.GreaterThanOrEqual
import com.indeni.ruleengine.expressions.core._
import com.indeni.ruleengine.expressions.data.{SelectTagsExpression, SelectTimeSeriesExpression, TimeSeriesExpression}
import com.indeni.server.common.data.conditions.True
import com.indeni.server.params.ParameterDefinition
import com.indeni.server.params.ParameterDefinition.UIType
import com.indeni.server.rules._
import com.indeni.server.rules.library.{ConditionalRemediationSteps, PerDeviceRule, RuleHelper}
import com.indeni.server.sensor.models.managementprocess.alerts.dto.AlertSeverity
import com.indeni.server.rules.library.core.HighLoadAverageRule._


/**
  * Per-device rule that raises an issue when the most recent value of the
  * "load-average-five-minutes" time series is greater than or equal to a
  * configurable threshold.
  *
  * The rule is registered under [[HighLoadAverageRule.NAME]] with ERROR severity,
  * in the HealthChecks category, for Linux-based devices, with an interval of
  * 10 minutes.
  */
case class HighLoadAverageRule() extends PerDeviceRule with RuleHelper {

  /** Name of the configuration parameter holding the alert threshold. */
  private[library] val highThresholdParameterName = "High_Threshold_of_Load_Average"

  // User-facing threshold parameter, edited in the UI as a double.
  // NOTE(review): the trailing 1.5 is presumably the default value - confirm
  // against ParameterDefinition.
  private val highThresholdParameter = new ParameterDefinition(
    highThresholdParameterName,
    "",
    "High Threshold of Five Minute Load Average",
    "What is the threshold for the five-minute load average for which once it is crossed an issue will be triggered.",
    UIType.DOUBLE,
    1.5)

  /** Static description of the rule: name, headline, description, severity, scope and parameters. */
  override val metadata: RuleMetadata = RuleMetadata.builder(NAME, "High load average",
    "indeni will trigger an issue when the load average on a given device seems high.",
    AlertSeverity.ERROR,
    categories= Set(RuleCategory.HealthChecks), deviceCategory = DeviceCategory.LinuxbasedDevices)
    .interval(TimeSpan.fromMinutes(10))
    .configParameter(highThresholdParameter)
    .build()

  /**
    * Builds the expression tree evaluated for this rule.
    *
    * @param context supplies the metadata and time-series DAOs used by the selectors
    * @return a status tree that selects by device key and, within each selection,
    *         compares the last five-minute load average against the threshold
    */
  override def expressionTree(context: RuleContext): StatusTreeExpression = {
    val actualValue = TimeSeriesExpression[Double]("load-average-five-minutes").last
    val threshold: OptionalExpression[Double] = getParameterDouble(highThresholdParameter)

    StatusTreeExpression(
      // Which objects to pull (normally, devices)
      SelectTagsExpression(context.metaDao, Set(DeviceKey), True),

          StatusTreeExpression(
            // The time-series we check the test condition against:
            SelectTimeSeriesExpression[Double](context.tsDao, Set("load-average-five-minutes"), denseOnly = false),

            // The condition which, if true, we have an issue. Checked against the time-series we've collected
            GreaterThanOrEqual(
              actualValue,
              threshold)

            // The Alert Item to add for this specific item
          ).withRootInfo(
              getHeadline(),
              // Two decimals are shown because load averages and the threshold are
              // fractional: with "%.0f" a value of 1.7 and a threshold of 1.5 would
              // both be rendered as "2", making the message self-contradictory.
              // (Assumes java.util.Formatter-style format specifiers - TODO confirm.)
              scopableStringFormatExpression("The five-minute load average is %.2f, above the threshold of %.2f.", actualValue, threshold),
              ConditionalRemediationSteps("Review the current activity on the device to determine if there is a specific cause for this.")
          ).asCondition()
    ).withoutInfo()
  }
}

/** Companion holding the constants shared with [[HighLoadAverageRule]]. */
object HighLoadAverageRule {

  /** Rule name passed to the rule's metadata builder. */
  private[library] val NAME: String = "linux_high_load_average"
}