Cluster down-checkpoint-False

Severity: error
Category: high-availability
Vendor: checkpoint
OS: False

Description:
Indeni will alert if a cluster is down or any of the members are inoperable.

Remediation Steps:
Review the cause for one or more members being down or inoperable.
Review other alerts for a cause for the cluster failure.

chkp-cphaprob_state_monitoring-vrrp

#! META
name: chkp-cphaprob_state_monitoring-vrrp
description: Run "cphaprob state" for cluster status monitoring for VRRP
type: monitoring
monitoring_interval: 1 minute
requires:
    vendor: "checkpoint"
    high-availability: "true"
    vrrp: "true"

#! COMMENTS
cluster-mode:
    skip-documentation: true

cluster-member-active:
    skip-documentation: true

cluster-member-active-live-config:
    skip-documentation: true

cluster-member-states:
    skip-documentation: true

cluster-state:
    skip-documentation: true

cluster-state-live-config:
    skip-documentation: true

#! REMOTE::SSH
stty rows 80 ; ${nice-path} -n 15 clish -c "show vrrp" && ${nice-path} -n 15 cphaprob state

#! PARSER::AWK
############
# Why: Important to know the status of the cluster
# How: Use both cphaprob state and show vrrp to get the complete picture
## This script is for VRRP clusters
# A remote peer can have the following states
# Down
# Active
# A local peer can have the following states
# Down
# Ready
# Active
# HA module not started.
### The cluster as a whole can have several states regarding different aspects of the cluster
## Redundancy
# The non-active member could be:
# Ready - This member will take over if the Active one goes down, but it is not syncing state tables, so the failover will cut all connections
# Down - This member will not take over in case the active member goes offline
## Health
# Active - This member is healthy and forwarding traffic
## VRRP
# A VRRP cluster shows both members as "Active" in cphaprob state. This is normal.
###########
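
# For reference, a hypothetical (abridged) sample of the input this parser
# consumes - the "show vrrp" output first, then "cphaprob state":
#
#   In Backup state 2
#   In Master state 0
#
#   Cluster Mode:   Sync only (OPSEC) with IGMP Membership
#
#   Number     Unique Address  Firewall State (*)
#   1 (local)  10.10.6.21      Active
#   2          10.10.6.22      Active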

BEGIN {
    cluster_is_active = 0
    tags["name"] = "VRRP"
}

# The following two sections have been added by request of Dan Shouky
# https://indeni.atlassian.net/browse/IKP-1221
# Unfortunately, the following code is duplicated in many .ind scripts.
# If you change something in the following two sections, please find all
# of the other instances of this code and make the change there also.
#Could not acquire the config lock
/Could not acquire the config lock/ {
    if (NR == 1) {
        next
    }
}
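# Note: the lock warning is skipped only when it is the very first line of
# output; anywhere else the line simply falls through to the other patterns.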

#CLINFR0829  Unable to get user permissions
#CLINFR0819  User: johndoe denied access via CLI
#CLINFR0599  Failed to build ACLs
/(CLINFR0829\s+Unable to get user permissions|CLINFR0819\s+User: .+ denied access via CLI|CLINFR0599\s+Failed to build ACLs)/ {
    critical_error = 1
    exit
}
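# Note: in awk, "exit" still runs the END block, which is why END re-checks
# critical_error before writing any metrics.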

# Find out the real status on the local member, since cphaprob state always shows "Active" even when Standby
#        In Backup state 2
#        In Master state 0
/\s+(In Master state|In Backup state)/ {
    # Set only the role that this line describes; assigning the other role in
    # an else branch would overwrite the value derived from the companion line
    if ($2 == "Backup") {
        if ($4 > 0) {
            remote_role = "Active"
        } else {
            remote_role = "Standby"
        }
    }
    if ($2 == "Master") {
        if ($4 > 0) {
            local_role = "Active"
        } else {
            local_role = "Standby"
        }
    }
}
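# Illustrative example, assuming awk's default whitespace field splitting:
# for the line "        In Backup state 2", $2 == "Backup" and $4 == "2";
# a count above zero means this member is in Backup for at least one VRID,
# so the remote peer must hold the Master role and is reported as "Active".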

# Determine cluster mode
#Cluster Mode:   Sync only (OPSEC) with IGMP Membership
/^Cluster Mode\:\s+Sync only/ {
    clustermode = "Sync only"
    writeComplexMetricString("cluster-mode", tags, clustermode)
}
# Has the cluster at least one healthy member forwarding traffic?
# Match any cluster member being Active (but not Active Attention)
#1 (local)  10.10.6.21      Active
/^\d+\s.*Active$/ {
    cluster_is_active = 1
}

#Number     Unique Address  Firewall State (*)
/Number\s*Unique\s*Address\s*Firewall\s*State/ {
    # Store the column layout for later use with getColData
    getColumns(trim($0), "[ ]{2,}", state_columns)
}
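# Illustrative example: given the header "Number     Unique Address  Firewall State (*)",
# getColumns() records where each column starts (columns are separated by two
# or more spaces, per the "[ ]{2,}" separator), so getColData() can later pull
# a named column out of a data row such as "1 (local)  10.10.6.21      Active".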

/\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3}/ {
    number = getColData(trim($0), state_columns, "Number")
    unique_address = getColData(trim($0), state_columns, "Unique Address")
    state = getColData(trim($0), state_columns, "Firewall State (*)")

    # local_state is 1 only when this row shows Active and "show vrrp" reported
    # the local member as Master; it is written for the local member's row below
    if (state ~ /Active/ && local_role == "Active") {
        local_state = 1
    } else {
        local_state = 0
    }

    all_members_index++
    all_members[all_members_index, "state-description"] = state
    all_members[all_members_index, "id"] = $1
    all_members[all_members_index, "unique-ip"] = unique_address

    if ($2 ~ /local/) {
        all_members[all_members_index, "is-local"] = 1
        if (state == "Active" && local_role) {
            all_members[all_members_index, "state-description"] = local_role
        }

        local_state_description = state
        writeComplexMetricStringWithLiveConfig("cluster-member-active-live-config", tags, local_state_description, "Cluster member state (this)")
        writeDoubleMetric("cluster-member-active", tags, "gauge", "60", local_state)
    }
    else {
        all_members[all_members_index, "is-local"] = 0
        if (state == "Active" && remote_role) {
            all_members[all_members_index, "state-description"] = remote_role
        }
    }
}
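# At this point each parsed member row has produced one entry in all_members.
# Illustrative, for the sample row "1 (local)  10.10.6.21      Active":
#   all_members[1, "id"] = "1"
#   all_members[1, "unique-ip"] = "10.10.6.21"
#   all_members[1, "state-description"] = "Active" (possibly replaced by the VRRP role)
#   all_members[1, "is-local"] = 1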

END {
    # There was an issue when fetching the config, see the note above regarding IKP-1221
    # Exiting the script
    if (critical_error) {
        exit
    }

    # Both members holding the Active role at the same time indicates a problem
    if (local_role == "Active" && remote_role == "Active") {
        both_active = "true"
    }

    # If at least one member is forwarding traffic, the cluster is OK. If the
    # cluster services are not running, the state is unclear and an alert is issued.
    clusterstate = 0
    if (cluster_is_active && both_active != "true") {
        clusterstate = 1
    }
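    # Resulting values, derived from the logic above:
    #   cluster_is_active == 1 and both_active != "true" -> clusterstate = 1 (healthy)
    #   cluster_is_active == 1 and both_active == "true" -> clusterstate = 0 (alert: both Active)
    #   cluster_is_active == 0                           -> clusterstate = 0 (alert: no active member)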
    cluster_state_description = "Local role: " local_role " - Remote role: " remote_role

    writeComplexMetricStringWithLiveConfig("cluster-state-live-config", tags, cluster_state_description, "Cluster State")
    writeDoubleMetric("cluster-state", tags, "gauge", "60", clusterstate)

    # Write status of all members
    if (all_members_index) {
        writeComplexMetricObjectArray("cluster-member-states", tags, all_members)
    }
}

cross_vendor_cluster_down_novsx

package com.indeni.server.rules.library.templatebased.crossvendor

import com.indeni.ruleengine.expressions.conditions.{Equals => RuleEquals, Not => RuleNot, Or => RuleOr}
import com.indeni.server.common.data.conditions.{Equals => DataEquals, Not => DataNot}
import com.indeni.server.rules.RuleContext
import com.indeni.server.rules.library._
import com.indeni.server.rules.library.templates.StateDownTemplateRule

/**
  * Alerts when a cluster is down or any of its members are inoperable,
  * for clustered devices that are not running in VSX mode.
  */
case class cross_vendor_cluster_down_novsx() extends StateDownTemplateRule(
  ruleName = "cross_vendor_cluster_down_novsx",
  ruleFriendlyName = "Clustered Devices (Non-VS): Cluster down",
  ruleDescription = "Indeni will alert if a cluster is down or any of the members are inoperable.",
  metricName = "cluster-state",
  applicableMetricTag = "name",
  metaCondition = !DataEquals("vsx", "true"),
  alertItemsHeader = "Clustering Elements Affected",
  alertDescription = "One or more clustering elements in this device are down. This alert was added per the request of <a target=\"_blank\" href=\"http://il.linkedin.com/pub/gal-vitenberg/83/484/103\">Gal Vitenberg</a>.",
  baseRemediationText = "Review the cause for one or more members being down or inoperable.")(
  ConditionalRemediationSteps.VENDOR_CP -> "Review other alerts for a cause for the cluster failure.",
  ConditionalRemediationSteps.VENDOR_PANOS -> "Log into the device over SSH and run \"less mp-log ha-agent.log\" for more information.",
  ConditionalRemediationSteps.OS_NXOS ->
    """|
      |1. Verify the communication between the FHRP peers. A random, momentary loss of data communication between the peers is the most common problem that results in continuous FHRP state changes (ACT <-> STB), unless the error message occurs during the initial installation.
      |2. Check the CPU utilization by using the "show processes cpu" NX-OS command. FHRP state changes are often due to high CPU utilization.
      |3. Common causes of FHRP packet loss between the peers to investigate are physical layer problems, excessive network traffic caused by spanning tree issues, or excessive traffic on individual VLANs.
      |
      |In the case of a vPC problem, validate the following:
      |1. Check that STP bridge assurance is not enabled on the vPC links. Bridge assurance should only be enabled on the vPC peer link.
      |2. Compare the vPC domain IDs of the two switches and ensure that they match. Execute the "show vpc brief" command and compare the output, which should match across the vPC peer switches.
      |3. Verify that both the source and destination IP addresses used for the peer-keepalive messages are reachable from the VRF associated with the vPC peer-keepalive link.
      |Then, execute the "sh vpc peer-keepalive" NX-OS command and review the output from both switches.
      |4. Verify that the peer-keepalive link is up. Otherwise, the vPC peer link will not come up.
      |5. Review the vPC peer link configuration, execute the "sh vpc brief" NX-OS command and review the output. Besides, verify that the vPC peer link is configured as a Layer 2 port channel trunk that allows only vPC VLANs.
      |6. Ensure that type 1 consistency parameters match. If they do not match, then vPC is suspended. Items that are type 2 do not have to match on both Nexus switches for the vPC to be operational. Execute the "sh vpc consistency-parameters" command and review the output.
      |7. Verify that the vPC number assigned to the port channel that connects to the downstream device from the vPC peer device is identical on both vPC peer devices.
      |8. If you manually configured the system priority, verify that you assigned the same priority value on both vPC peer devices.
      |9. Verify that the primary vPC is the primary STP root and the secondary vPC is the secondary STP root.
      |10. Review the logs for relevant findings.
      |11. For more information, please review the following vPC troubleshooting guide:
      |https://www.cisco.com/c/en/us/td/docs/switches/datacenter/nexus5000/sw/troubleshooting/guide/N5K_Troubleshooting_Guide/n5K_ts_vpc.html""".stripMargin
)