Skip to content
Icon

catalog Deployment Health¶

Profile Avatar

Icon 1 7 Troubleshooting Commands

Icon 1 Last updated 16 weeks ago

Icon 1 Contributed by stewartshea



Troubleshooting Commands¶

Check Liveness Probe Configuration for Deployment catalog¶

What does it do?

This Bash script validates the probe configuration in a Kubernetes deployment manifest. It checks whether the ports used by each container's probes are exposed by that container and recommends next steps if any issues are found.

Command
NAMESPACE="acme-fitness" CONTEXT="gke_runwhen-nonprod-sandbox_us-central1_sandbox-cluster-1-cluster" KUBERNETES_DISTRIBUTION_BINARY="kubectl" DEPLOYMENT_NAME="catalog" EXPECTED_AVAILABILITY="1" ANOMALY_THRESHOLD="0.2" LOGS_ERROR_PATTERN="" LOGS_EXCLUDE_PATTERN="info"  bash -c "$(curl -s https://raw.githubusercontent.com/runwhen-contrib/rw-cli-codecollection/main/codebundles/k8s-deployment-healthcheck/validate_probes.sh)" _ livenessProbe | tee "${SCRIPT_TMP_DIR}/liveness_probe_output"
IconCopy to clipboard Copied to clipboard

Learn more

This multi-line content is auto-generated and used for educational purposes. Copying and pasting the multi-line text might not function as expected.

  1
  2
  3
  4
  5
  6
  7
  8
  9
 10
 11
 12
 13
 14
 15
 16
 17
 18
 19
 20
 21
 22
 23
 24
 25
 26
 27
 28
 29
 30
 31
 32
 33
 34
 35
 36
 37
 38
 39
 40
 41
 42
 43
 44
 45
 46
 47
 48
 49
 50
 51
 52
 53
 54
 55
 56
 57
 58
 59
 60
 61
 62
 63
 64
 65
 66
 67
 68
 69
 70
 71
 72
 73
 74
 75
 76
 77
 78
 79
 80
 81
 82
 83
 84
 85
 86
 87
 88
 89
 90
 91
 92
 93
 94
 95
 96
 97
 98
 99
100
#!/bin/bash

# Probe type to validate, taken from the first positional argument
PROBE_TYPE="${1:-readinessProbe}"  # Defaults to readinessProbe when no argument is given; pass livenessProbe to check liveness probes

# Apply a jq filter to a JSON document and print the raw (-r) result.
#   $1 - JSON text
#   $2 - jq filter expression
# jq's error output is discarded, so a failing filter simply prints nothing.
extract_data() {
    local json_text=$1
    local jq_filter=$2
    jq -r "$jq_filter" 2>/dev/null <<<"$json_text"
}

# Print the first port-like token found in a command string.
#   $1 - command text to scan
# A port is the run of digits that directly follows a colon (e.g. ":8080").
# Nothing is printed when the text contains no such token.
extract_port_from_command() {
    local command_text=$1
    if [[ $command_text =~ :([0-9]+) ]]; then
        printf '%s\n' "${BASH_REMATCH[1]}"
    fi
}

# Get deployment manifest in JSON format.
# KUBERNETES_DISTRIBUTION_BINARY is expanded unquoted, exactly as before.
if ! MANIFEST=$(${KUBERNETES_DISTRIBUTION_BINARY} get deployment "$DEPLOYMENT_NAME" -n "$NAMESPACE" --context "$CONTEXT" -o json); then
    echo "Error fetching deployment details: $MANIFEST"
    exit 1
fi

# Get number of containers. jq reports the length of a missing list as 0, so
# both an empty result (jq failed) and "0" mean there is nothing to validate.
NUM_CONTAINERS=$(extract_data "$MANIFEST" '.spec.template.spec.containers | length')
if [[ -z "$NUM_CONTAINERS" || "$NUM_CONTAINERS" == "0" ]]; then
    echo "No containers found in deployment."
    exit 1
fi

next_steps=()  # Recommendations collected while validating; printed at the end

# Succeeds when $2 is exactly one of the newline-separated entries in $1.
# Whole-line matching keeps port 80 from being accepted just because 8080 is listed.
port_is_listed() {
    [[ -n "$2" ]] && grep -qxF -- "$2" <<<"$1"
}

# Loop through containers and validate probes
for ((i=0; i<NUM_CONTAINERS; i++)); do
    PROBE=$(extract_data "$MANIFEST" ".spec.template.spec.containers[$i].${PROBE_TYPE}")
    CONTAINER_NAME=$(extract_data "$MANIFEST" ".spec.template.spec.containers[$i].name")
    echo "-------- START Validation - Container Name: $CONTAINER_NAME Probe Type: $PROBE_TYPE -------"
    echo "Container: \`$CONTAINER_NAME\`"
    echo "$PROBE_TYPE: $PROBE"

    # List container ports (extracted once per container and reused below)
    CONTAINER_PORTS=$(extract_data "$MANIFEST" ".spec.template.spec.containers[$i].ports[].containerPort")
    # httpGet/tcpSocket probes may refer to a port by name, so port names are
    # accepted alongside port numbers when matching those probes.
    CONTAINER_PORT_IDS=$(extract_data "$MANIFEST" ".spec.template.spec.containers[$i].ports[] | .containerPort, (.name // empty)")
    if [ -n "$CONTAINER_PORTS" ]; then
        echo "Exposed Ports: $CONTAINER_PORTS"
    else
        echo "No ports exposed."
    fi

    # jq -r prints the literal text "null" for a missing probe, so test for that
    # as well as for an empty result.
    if [[ -z "$PROBE" || "$PROBE" == "null" ]]; then
        echo "Container \`$CONTAINER_NAME\`: ${PROBE_TYPE} not found."
        echo "------- END Validation - Container Name: $CONTAINER_NAME Probe Type: $PROBE_TYPE -------"
        continue
    fi

    # Validate that the port in the probe is defined in the container's ports.
    # '//' yields whichever handler is present; with ',' jq -e would judge only
    # the last output (.tcpSocket) and skip every httpGet probe.
    if jq -e '.httpGet // .tcpSocket' <<<"$PROBE" >/dev/null; then
        PROBE_PORT=$(extract_data "$PROBE" '.httpGet.port // .tcpSocket.port')

        if port_is_listed "$CONTAINER_PORT_IDS" "$PROBE_PORT"; then
            echo "Container \`$CONTAINER_NAME\`: ${PROBE_TYPE} port $PROBE_PORT is valid."
        else
            echo "Container \`$CONTAINER_NAME\`: Port $PROBE_PORT used in $PROBE_TYPE is not exposed by the container."
            next_steps+=("Update $PROBE_TYPE For \`${DEPLOYMENT_NAME}\` to use one of the following ports: $CONTAINER_PORTS")
        fi
    fi

    # Check if exec permissions are available (for exec type probes)
    if jq -e '.exec' <<<"$PROBE" >/dev/null; then
        # Read the exec command words, one per line, into an array
        IFS=$'\n' read -r -d '' -a EXEC_COMMAND_ARRAY < <(echo "$PROBE" | jq -r '.exec.command[]' && printf '\0')
        PORT_IN_COMMAND=$(extract_port_from_command "${EXEC_COMMAND_ARRAY[*]}")

        # Check if we see the port in the exec command, and if so, if it's defined in the manifest
        if [ -n "$PORT_IN_COMMAND" ]; then
            if port_is_listed "$CONTAINER_PORTS" "$PORT_IN_COMMAND"; then
                echo "Container \`$CONTAINER_NAME\`: Port $PORT_IN_COMMAND in ${PROBE_TYPE} exec command is valid."
            else
                echo "Container \`$CONTAINER_NAME\`: Port $PORT_IN_COMMAND used in ${PROBE_TYPE} exec command is not exposed by the container. The following ports are exposed: $CONTAINER_PORTS"
                next_steps+=("Get Deployment Workload Details For \`$DEPLOYMENT_NAME\`")
                next_steps+=("Remediate Readiness and Liveness Probes for Deployments in Namespace \`${NAMESPACE}\`")
            fi
        fi

        # Check exec permission against the same cluster context the manifest came from
        if ${KUBERNETES_DISTRIBUTION_BINARY} auth can-i create pods/exec -n "$NAMESPACE" --context "$CONTEXT" >/dev/null 2>&1; then
            # Execute command
            # ...
            # ... (more code to execute command and test ports)
            :  # no-op placeholder: a 'then' branch holding only comments is a syntax error
        else
            echo "Exec permission is not available."
        fi
    fi
    echo "------- END Validation - Container Name: $CONTAINER_NAME Probe Type: $PROBE_TYPE -------"
done

# Display all unique recommendations that can be shown as Next Steps
if [[ ${#next_steps[@]} -ne 0 ]]; then
    printf "\nRecommended Next Steps: \n"
    printf "%s\n" "${next_steps[@]}" | sort -u  # Print unique next steps
fi
Helpful Links

Check Readiness Probe Configuration for Deployment catalog¶

What does it do?

This Bash script validates the probe configuration in a Kubernetes deployment manifest. It checks whether the ports used by each container's probes are exposed by that container and recommends next steps if any issues are found.

Command
NAMESPACE="acme-fitness" CONTEXT="gke_runwhen-nonprod-sandbox_us-central1_sandbox-cluster-1-cluster" KUBERNETES_DISTRIBUTION_BINARY="kubectl" DEPLOYMENT_NAME="catalog" EXPECTED_AVAILABILITY="1" ANOMALY_THRESHOLD="0.2" LOGS_ERROR_PATTERN="" LOGS_EXCLUDE_PATTERN="info"  bash -c "$(curl -s https://raw.githubusercontent.com/runwhen-contrib/rw-cli-codecollection/main/codebundles/k8s-deployment-healthcheck/validate_probes.sh)" _ readinessProbe | tee "${SCRIPT_TMP_DIR}/readiness_probe_output"
IconCopy to clipboard Copied to clipboard

Learn more

This multi-line content is auto-generated and used for educational purposes. Copying and pasting the multi-line text might not function as expected.

  1
  2
  3
  4
  5
  6
  7
  8
  9
 10
 11
 12
 13
 14
 15
 16
 17
 18
 19
 20
 21
 22
 23
 24
 25
 26
 27
 28
 29
 30
 31
 32
 33
 34
 35
 36
 37
 38
 39
 40
 41
 42
 43
 44
 45
 46
 47
 48
 49
 50
 51
 52
 53
 54
 55
 56
 57
 58
 59
 60
 61
 62
 63
 64
 65
 66
 67
 68
 69
 70
 71
 72
 73
 74
 75
 76
 77
 78
 79
 80
 81
 82
 83
 84
 85
 86
 87
 88
 89
 90
 91
 92
 93
 94
 95
 96
 97
 98
 99
100
#!/bin/bash

# Probe type to validate, taken from the first positional argument
PROBE_TYPE="${1:-readinessProbe}"  # Defaults to readinessProbe when no argument is given; pass livenessProbe to check liveness probes

# Apply a jq filter to a JSON document and print the raw (-r) result.
#   $1 - JSON text
#   $2 - jq filter expression
# jq's error output is discarded, so a failing filter simply prints nothing.
extract_data() {
    local json_text=$1
    local jq_filter=$2
    jq -r "$jq_filter" 2>/dev/null <<<"$json_text"
}

# Print the first port-like token found in a command string.
#   $1 - command text to scan
# A port is the run of digits that directly follows a colon (e.g. ":8080").
# Nothing is printed when the text contains no such token.
extract_port_from_command() {
    local command_text=$1
    if [[ $command_text =~ :([0-9]+) ]]; then
        printf '%s\n' "${BASH_REMATCH[1]}"
    fi
}

# Get deployment manifest in JSON format.
# KUBERNETES_DISTRIBUTION_BINARY is expanded unquoted, exactly as before.
if ! MANIFEST=$(${KUBERNETES_DISTRIBUTION_BINARY} get deployment "$DEPLOYMENT_NAME" -n "$NAMESPACE" --context "$CONTEXT" -o json); then
    echo "Error fetching deployment details: $MANIFEST"
    exit 1
fi

# Get number of containers. jq reports the length of a missing list as 0, so
# both an empty result (jq failed) and "0" mean there is nothing to validate.
NUM_CONTAINERS=$(extract_data "$MANIFEST" '.spec.template.spec.containers | length')
if [[ -z "$NUM_CONTAINERS" || "$NUM_CONTAINERS" == "0" ]]; then
    echo "No containers found in deployment."
    exit 1
fi

next_steps=()  # Recommendations collected while validating; printed at the end

# Succeeds when $2 is exactly one of the newline-separated entries in $1.
# Whole-line matching keeps port 80 from being accepted just because 8080 is listed.
port_is_listed() {
    [[ -n "$2" ]] && grep -qxF -- "$2" <<<"$1"
}

# Loop through containers and validate probes
for ((i=0; i<NUM_CONTAINERS; i++)); do
    PROBE=$(extract_data "$MANIFEST" ".spec.template.spec.containers[$i].${PROBE_TYPE}")
    CONTAINER_NAME=$(extract_data "$MANIFEST" ".spec.template.spec.containers[$i].name")
    echo "-------- START Validation - Container Name: $CONTAINER_NAME Probe Type: $PROBE_TYPE -------"
    echo "Container: \`$CONTAINER_NAME\`"
    echo "$PROBE_TYPE: $PROBE"

    # List container ports (extracted once per container and reused below)
    CONTAINER_PORTS=$(extract_data "$MANIFEST" ".spec.template.spec.containers[$i].ports[].containerPort")
    # httpGet/tcpSocket probes may refer to a port by name, so port names are
    # accepted alongside port numbers when matching those probes.
    CONTAINER_PORT_IDS=$(extract_data "$MANIFEST" ".spec.template.spec.containers[$i].ports[] | .containerPort, (.name // empty)")
    if [ -n "$CONTAINER_PORTS" ]; then
        echo "Exposed Ports: $CONTAINER_PORTS"
    else
        echo "No ports exposed."
    fi

    # jq -r prints the literal text "null" for a missing probe, so test for that
    # as well as for an empty result.
    if [[ -z "$PROBE" || "$PROBE" == "null" ]]; then
        echo "Container \`$CONTAINER_NAME\`: ${PROBE_TYPE} not found."
        echo "------- END Validation - Container Name: $CONTAINER_NAME Probe Type: $PROBE_TYPE -------"
        continue
    fi

    # Validate that the port in the probe is defined in the container's ports.
    # '//' yields whichever handler is present; with ',' jq -e would judge only
    # the last output (.tcpSocket) and skip every httpGet probe.
    if jq -e '.httpGet // .tcpSocket' <<<"$PROBE" >/dev/null; then
        PROBE_PORT=$(extract_data "$PROBE" '.httpGet.port // .tcpSocket.port')

        if port_is_listed "$CONTAINER_PORT_IDS" "$PROBE_PORT"; then
            echo "Container \`$CONTAINER_NAME\`: ${PROBE_TYPE} port $PROBE_PORT is valid."
        else
            echo "Container \`$CONTAINER_NAME\`: Port $PROBE_PORT used in $PROBE_TYPE is not exposed by the container."
            next_steps+=("Update $PROBE_TYPE For \`${DEPLOYMENT_NAME}\` to use one of the following ports: $CONTAINER_PORTS")
        fi
    fi

    # Check if exec permissions are available (for exec type probes)
    if jq -e '.exec' <<<"$PROBE" >/dev/null; then
        # Read the exec command words, one per line, into an array
        IFS=$'\n' read -r -d '' -a EXEC_COMMAND_ARRAY < <(echo "$PROBE" | jq -r '.exec.command[]' && printf '\0')
        PORT_IN_COMMAND=$(extract_port_from_command "${EXEC_COMMAND_ARRAY[*]}")

        # Check if we see the port in the exec command, and if so, if it's defined in the manifest
        if [ -n "$PORT_IN_COMMAND" ]; then
            if port_is_listed "$CONTAINER_PORTS" "$PORT_IN_COMMAND"; then
                echo "Container \`$CONTAINER_NAME\`: Port $PORT_IN_COMMAND in ${PROBE_TYPE} exec command is valid."
            else
                echo "Container \`$CONTAINER_NAME\`: Port $PORT_IN_COMMAND used in ${PROBE_TYPE} exec command is not exposed by the container. The following ports are exposed: $CONTAINER_PORTS"
                next_steps+=("Get Deployment Workload Details For \`$DEPLOYMENT_NAME\`")
                next_steps+=("Remediate Readiness and Liveness Probes for Deployments in Namespace \`${NAMESPACE}\`")
            fi
        fi

        # Check exec permission against the same cluster context the manifest came from
        if ${KUBERNETES_DISTRIBUTION_BINARY} auth can-i create pods/exec -n "$NAMESPACE" --context "$CONTEXT" >/dev/null 2>&1; then
            # Execute command
            # ...
            # ... (more code to execute command and test ports)
            :  # no-op placeholder: a 'then' branch holding only comments is a syntax error
        else
            echo "Exec permission is not available."
        fi
    fi
    echo "------- END Validation - Container Name: $CONTAINER_NAME Probe Type: $PROBE_TYPE -------"
done

# Display all unique recommendations that can be shown as Next Steps
if [[ ${#next_steps[@]} -ne 0 ]]; then
    printf "\nRecommended Next Steps: \n"
    printf "%s\n" "${next_steps[@]}" | sort -u  # Print unique next steps
fi
Helpful Links

Inspect Deployment Warning Events for catalog¶

What does it do?

This command uses kubectl to fetch events from a specific context and namespace in JSON format, then filters them with jq to keep only Warning events from the last hour that involve Deployments, ReplicaSets, or Pods whose name contains the deployment name. The output is grouped by kind and name and shows, for each group, the event count, the unique reasons, the unique messages, and the first and last timestamps.

Command
kubectl get events --context gke_runwhen-nonprod-sandbox_us-central1_sandbox-cluster-1-cluster -n acme-fitness -o json | jq '(now - (60*60)) as $time_limit | [ .items[] | select(.type == "Warning" and (.involvedObject.kind == "Deployment" or .involvedObject.kind == "ReplicaSet" or .involvedObject.kind == "Pod") and (.involvedObject.name | tostring | contains("catalog")) and (.lastTimestamp | fromdateiso8601) >= $time_limit) | {kind: .involvedObject.kind, name: .involvedObject.name, reason: .reason, message: .message, firstTimestamp: .firstTimestamp, lastTimestamp: .lastTimestamp} ] | group_by([.kind, .name]) | map({kind: .[0].kind, name: .[0].name, count: length, reasons: map(.reason) | unique, messages: map(.message) | unique, firstTimestamp: map(.firstTimestamp | fromdateiso8601) | sort | .[0] | todateiso8601, lastTimestamp: map(.lastTimestamp | fromdateiso8601) | sort | reverse | .[0] | todateiso8601})'
IconCopy to clipboard Copied to clipboard

Learn more

This multi-line content is auto-generated and used for educational purposes. Copying and pasting the multi-line text might not function as expected.

# Get events from a Kubernetes cluster in a specific context and namespace in
# JSON format, then use jq to filter and format the data.
# The deployment name is handed to jq with --arg: the jq program is single-quoted,
# so a shell variable written inside it would never be expanded.
kubectl get events --context "${CONTEXT}" -n "${NAMESPACE}" -o json | \
jq --arg deployment_name "${DEPLOYMENT_NAME}" '
# Define a time limit based on 1 hour ago
(now - (60*60)) as $time_limit |

# Filter and map items based on certain conditions
[ .items[] |

# Select only Warning events related to Deployment, ReplicaSet, or Pod
select(.type == "Warning" and 
       (.involvedObject.kind == "Deployment" or
        .involvedObject.kind == "ReplicaSet" or
        .involvedObject.kind == "Pod") and

# Make sure the name contains the given deployment name
        (.involvedObject.name | tostring | contains($deployment_name)) and

# Check if lastTimestamp is more recent than the time limit
        (.lastTimestamp | fromdateiso8601) >= $time_limit) |

# Create a new object with selected properties
{kind: .involvedObject.kind, 
 name: .involvedObject.name, 
 reason: .reason, 
 message: .message, 
 firstTimestamp: .firstTimestamp, 
 lastTimestamp: .lastTimestamp} ] |

# Group the objects by kind and name
group_by([.kind, .name]) |

# Map the grouped objects into a desired format
map({kind: .[0].kind, 
     name: .[0].name, 
     count: length, 
     reasons: map(.reason) | unique, 
     messages: map(.message) | unique, 
     firstTimestamp: map(.firstTimestamp | fromdateiso8601) | sort | .[0] | todateiso8601, 
     lastTimestamp: map(.lastTimestamp | fromdateiso8601) | sort | reverse | .[0] | todateiso8601})'
Helpful Links

Get Deployment Workload Details For catalog and Add to Report¶

What does it do?

This command is used to retrieve the YAML configuration for a specific deployment in a Kubernetes cluster, specified by the deployment name, context, and namespace. It's a way to view the detailed configuration settings for that deployment.

Command
kubectl get deployment/catalog --context gke_runwhen-nonprod-sandbox_us-central1_sandbox-cluster-1-cluster -n acme-fitness -o yaml
IconCopy to clipboard Copied to clipboard

Learn more

This multi-line content is auto-generated and used for educational purposes. Copying and pasting the multi-line text might not function as expected.

# The following command is used to retrieve the configuration details of a specific deployment in a YAML format.
# The arguments are collected in an array so that each one can carry its own
# comment while still being passed to a single kubectl invocation.
kubectl_args=(
    # Use kubectl to get the deployment with the specified name
    get "deployment/${DEPLOYMENT_NAME}"

    # Specify the context for the cluster where the deployment resides
    --context "${CONTEXT}"

    # Specify the namespace where the deployment is located
    -n "${NAMESPACE}"

    # Output the details in YAML format for easier reading and manipulation
    -o yaml
)
kubectl "${kubectl_args[@]}"
Helpful Links

Inspect Deployment Replicas for catalog¶

What does it do?

This command retrieves information about a specific deployment from the Kubernetes cluster, including the number of desired, ready, missing, and unavailable replicas, as well as the available and progressing conditions. It uses jq to filter and format the information in JSON output.

Command
kubectl get deployment/catalog --context gke_runwhen-nonprod-sandbox_us-central1_sandbox-cluster-1-cluster -n acme-fitness -o json | jq '.status | {desired_replicas: .replicas, ready_replicas: (.readyReplicas // 0), missing_replicas: ((.replicas // 0) - (.readyReplicas // 0)), unavailable_replicas: (.unavailableReplicas // 0), available_condition: (if any(.conditions[]; .type == "Available") then (.conditions[] | select(.type == "Available")) else "Condition not available" end), progressing_condition: (if any(.conditions[]; .type == "Progressing") then (.conditions[] | select(.type == "Progressing")) else "Condition not available" end)}'
IconCopy to clipboard Copied to clipboard

Learn more

This multi-line content is auto-generated and used for educational purposes. Copying and pasting the multi-line text might not function as expected.

# Set variables for the deployment name, context, and namespace
DEPLOYMENT_NAME=my-deployment
CONTEXT=my-context
NAMESPACE=my-namespace

# Retrieve details about the specified deployment in JSON format, use 'jq' to parse the information.
# '.conditions[]?' tolerates a status without conditions, so the
# "Condition not available" fallback is produced instead of a jq error.
kubectl get "deployment/${DEPLOYMENT_NAME}" --context "${CONTEXT}" -n "${NAMESPACE}" -o json | \
    jq '.status | {
        desired_replicas: .replicas,
        ready_replicas: (.readyReplicas // 0),
        missing_replicas: ((.replicas // 0) - (.readyReplicas // 0)),
        unavailable_replicas: (.unavailableReplicas // 0),
        available_condition: (if any(.conditions[]?; .type == "Available") then 
                                (.conditions[] | select(.type == "Available")) 
                              else "Condition not available" end),
        progressing_condition: (if any(.conditions[]?; .type == "Progressing") then 
                                  (.conditions[] | select(.type == "Progressing")) 
                                else "Condition not available" end)
    }'
Helpful Links

Check Deployment Event Anomalies for catalog¶

What does it do?

This Bash script retrieves Kubernetes events in JSON format, filters and aggregates them with jq, and prints the processed events. It considers non-Warning events for Deployments, ReplicaSets, and Pods whose name contains the deployment name, and is used to monitor and analyze event activity for that deployment.

Command
NAMESPACE="acme-fitness" CONTEXT="gke_runwhen-nonprod-sandbox_us-central1_sandbox-cluster-1-cluster" KUBERNETES_DISTRIBUTION_BINARY="kubectl" DEPLOYMENT_NAME="catalog" EXPECTED_AVAILABILITY="1" ANOMALY_THRESHOLD="0.2" LOGS_ERROR_PATTERN="" LOGS_EXCLUDE_PATTERN="info"  bash -c "$(curl -s https://raw.githubusercontent.com/runwhen-contrib/rw-cli-codecollection/main/codebundles/k8s-deployment-healthcheck/event_anomalies.sh)" _
IconCopy to clipboard Copied to clipboard

Learn more

This multi-line content is auto-generated and used for educational purposes. Copying and pasting the multi-line text might not function as expected.

 1
 2
 3
 4
 5
 6
 7
 8
 9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
#!/bin/bash

# Assuming environment variables are already exported and available:
# KUBERNETES_DISTRIBUTION_BINARY, CONTEXT, NAMESPACE and DEPLOYMENT_NAME are read below.

# Command to get Kubernetes events in JSON format.
# KUBERNETES_DISTRIBUTION_BINARY is expanded unquoted, exactly as before;
# the context and namespace are quoted so unusual values stay one argument.
EVENTS_JSON=$(${KUBERNETES_DISTRIBUTION_BINARY} get events --context "${CONTEXT}" -n "${NAMESPACE}" -o json)

# Use jq to process the JSON, skipping events without valid timestamps
PROCESSED_EVENTS=$(echo "${EVENTS_JSON}" | jq --arg DEPLOYMENT_NAME "${DEPLOYMENT_NAME}" '
  [ .items[]
    | select(
        .type != "Warning"                                    # Filtering out warning events
        and (.involvedObject.kind | test("^(Deployment|ReplicaSet|Pod)$"))   # Selecting events related to exactly Deployment, ReplicaSet, or Pod (anchored so longer kind names do not match)
        and (.involvedObject.name | contains($DEPLOYMENT_NAME))         # Selecting events with specific deployment name
        and (.firstTimestamp | fromdateiso8601? // empty) and (.lastTimestamp | fromdateiso8601? // empty)      # Checking for valid timestamps
      )
    | {
        kind: .involvedObject.kind,                    # Extracting relevant information from each event
        count: .count,
        name: .involvedObject.name,
        reason: .reason,
        message: .message,
        firstTimestamp: .firstTimestamp,
        lastTimestamp: .lastTimestamp,
        duration: (
          if (((.lastTimestamp | fromdateiso8601) - (.firstTimestamp | fromdateiso8601)) == 0)     # Duration in minutes; a zero-length event is given duration 1
          then 1
          else (((.lastTimestamp | fromdateiso8601) - (.firstTimestamp | fromdateiso8601)) / 60)
          end
        )
      }
  ]
  | group_by([.kind, .name])                      # Grouping events by kind and name
  | map({
      kind: .[0].kind,                             # Processing grouped events to calculate total count, reasons, messages, etc.
      name: .[0].name,
      count: (map(.count) | add),
      reasons: (map(.reason) | unique),
      messages: (map(.message) | unique),
      average_events_per_minute: (
        if .[0].duration == 1                        # The rate uses the duration of the first event in the group only
        then 1
        else ((map(.count) | add) / .[0].duration)
        end
      ),
      firstTimestamp: (map(.firstTimestamp | fromdateiso8601) | sort | .[0] | todateiso8601),  # Finding first and last timestamp of events
      lastTimestamp: (map(.lastTimestamp | fromdateiso8601) | sort | reverse | .[0] | todateiso8601)
    })
')

echo "${PROCESSED_EVENTS}"
Helpful Links

Check ReplicaSet Health for Deployment catalog¶

What does it do?

This script inspects the ReplicaSets that belong to a Kubernetes Deployment: it checks for multiple ReplicaSets, verifies that the latest ReplicaSet is the active one, and reports actionable insights about inactive or conflicting ReplicaSets, both during normal operation and during rolling updates. It performs these checks on JSON data obtained from the Kubernetes cluster.

Command
NAMESPACE="acme-fitness" CONTEXT="gke_runwhen-nonprod-sandbox_us-central1_sandbox-cluster-1-cluster" KUBERNETES_DISTRIBUTION_BINARY="kubectl" DEPLOYMENT_NAME="catalog" EXPECTED_AVAILABILITY="1" ANOMALY_THRESHOLD="0.2" LOGS_ERROR_PATTERN="" LOGS_EXCLUDE_PATTERN="info"  bash -c "$(curl -s https://raw.githubusercontent.com/runwhen-contrib/rw-cli-codecollection/main/codebundles/k8s-deployment-healthcheck/check_replicaset.sh)" _ | tee "${SCRIPT_TMP_DIR}/rs_analysis"
IconCopy to clipboard Copied to clipboard

Learn more

This multi-line content is auto-generated and used for educational purposes. Copying and pasting the multi-line text might not function as expected.

  1
  2
  3
  4
  5
  6
  7
  8
  9
 10
 11
 12
 13
 14
 15
 16
 17
 18
 19
 20
 21
 22
 23
 24
 25
 26
 27
 28
 29
 30
 31
 32
 33
 34
 35
 36
 37
 38
 39
 40
 41
 42
 43
 44
 45
 46
 47
 48
 49
 50
 51
 52
 53
 54
 55
 56
 57
 58
 59
 60
 61
 62
 63
 64
 65
 66
 67
 68
 69
 70
 71
 72
 73
 74
 75
 76
 77
 78
 79
 80
 81
 82
 83
 84
 85
 86
 87
 88
 89
 90
 91
 92
 93
 94
 95
 96
 97
 98
 99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
#!/bin/bash

# Kubernetes Deployment ReplicaSet Management Script
# This script checks Kubernetes deployments to ensure they are running the latest ReplicaSet. It is designed to manage
# ReplicaSets during normal operations and rolling updates, checking for multiple ReplicaSets, verifying the active latest ReplicaSet, and providing actionable insights for any inactive or conflicting ReplicaSets.

# Function to check for rolling update status
#
# Globals:
#   DEPLOYMENT_JSON       (read)    Deployment object as JSON text
#   DEPLOYMENT_NAME       (read)    used in the status message
#   ROLLING_UPDATE_STATUS (written) 0 = rollout in progress or finished less
#                                   than 2 minutes ago, 1 = stable
# Outputs:
#   one human-readable status line on stdout
check_rolling_update_status() {
    # Declared separately from the assignments so 'local' does not mask the
    # exit status of the command substitutions.
    local progressing_condition progressing_status progressing_reason last_update_time
    local replicas updated_replicas available_replicas ready_replicas
    local last_update_epoch stabilization_cutoff

    # Extract the Progressing condition and the replica counts
    progressing_condition=$(jq '.status.conditions[] | select(.type=="Progressing")' <<<"$DEPLOYMENT_JSON")
    replicas=$(jq '.status.replicas // 0' <<<"$DEPLOYMENT_JSON")
    updated_replicas=$(jq '.status.updatedReplicas // 0' <<<"$DEPLOYMENT_JSON")
    available_replicas=$(jq '.status.availableReplicas // 0' <<<"$DEPLOYMENT_JSON")
    ready_replicas=$(jq '.status.readyReplicas // 0' <<<"$DEPLOYMENT_JSON")

    # Interpret 'Progressing' condition more accurately
    progressing_status=$(jq -r '.status' <<<"$progressing_condition")
    progressing_reason=$(jq -r '.reason' <<<"$progressing_condition")
    last_update_time=$(jq -r '.lastUpdateTime' <<<"$progressing_condition")

    # Compare replica counts for a more accurate ongoing rollout check
    if [[ "$progressing_status" == "True" && "$progressing_reason" == "NewReplicaSetAvailable" && "$updated_replicas" == "$replicas" && "$available_replicas" == "$updated_replicas" && "$ready_replicas" == "$updated_replicas" ]]; then
        # Allow a 2-minute buffer after the last update for stabilization.
        # Both values are epoch seconds; 'date -d' relative dates assume GNU date.
        last_update_epoch=$(date -d "$last_update_time" +%s)
        stabilization_cutoff=$(date -d '-2 minutes' +%s)
        if [[ "$last_update_epoch" -lt "$stabilization_cutoff" ]]; then
            echo "Deployment $DEPLOYMENT_NAME is stable. No active rollout detected."
            ROLLING_UPDATE_STATUS=1 # Indicates no update is in progress
        else
            echo "Deployment $DEPLOYMENT_NAME has recently updated and may still be stabilizing."
            ROLLING_UPDATE_STATUS=0 # Indicates recent update, considering stabilization
        fi
    elif [[ "$updated_replicas" -lt "$replicas" ]] || [[ "$available_replicas" -lt "$updated_replicas" ]] || [[ "$ready_replicas" -lt "$updated_replicas" ]]; then
        echo "Deployment $DEPLOYMENT_NAME is undergoing a rollout."
        ROLLING_UPDATE_STATUS=0 # Indicates an update is in progress
    else
        echo "Deployment $DEPLOYMENT_NAME is stable. No active rollout detected."
        ROLLING_UPDATE_STATUS=1 # Indicates no update is in progress
    fi
}

# Count the deployment's pods that are not owned by the latest ReplicaSet.
#
# Globals:
#   KUBERNETES_DISTRIBUTION_BINARY, NAMESPACE, CONTEXT, DEPLOYMENT_NAME (read)
#   LATEST_RS        (read)    name every pod's owning ReplicaSet is compared with
#   RS               (read)    ReplicaSet named in the suggested next step
#   RS_DETAILS_JSON  (read)    ReplicaSet JSON attached to the issue as details
#   PODS_JSON, PODS_COUNT, POD_RS, OUTDATED_PODS_COUNT (written)
#   issue_details    (written) JSON issue object, only when outdated pods exist
# Outputs:
#   one summary line on stdout
# NOTE(review): pods are selected with the label app=<deployment name>; confirm
# that the deployment's pods actually carry that label.
verify_pods_association_with_latest_rs() {
    local i

    # Fetch all pods associated with the deployment
    PODS_JSON=$(${KUBERNETES_DISTRIBUTION_BINARY} get pods -n "$NAMESPACE" --context "$CONTEXT" --selector="app=$DEPLOYMENT_NAME" -o json)
    PODS_COUNT=$(jq '.items | length' <<<"$PODS_JSON")
    OUTDATED_PODS_COUNT=0

    for ((i=0; i<PODS_COUNT; i++)); do
        POD_RS=$(jq -r ".items[$i].metadata.ownerReferences[] | select(.kind == \"ReplicaSet\") | .name" <<<"$PODS_JSON")
        if [[ "$POD_RS" != "$LATEST_RS" ]]; then
            OUTDATED_PODS_COUNT=$((OUTDATED_PODS_COUNT + 1))
        fi
    done

    if [[ "$OUTDATED_PODS_COUNT" -eq 0 ]]; then
        echo "All pods are correctly associated with the latest ReplicaSet."
    else
        echo "Warning: $OUTDATED_PODS_COUNT pod(s) are not associated with the latest ReplicaSet."
        # Built with jq so the embedded ReplicaSet JSON is escaped into a valid JSON string
        issue_details=$(jq -cn \
            --arg title "$OUTDATED_PODS_COUNT pod(s) are not running the latest version of Deployment \`$DEPLOYMENT_NAME\` in namespace \`${NAMESPACE}\`" \
            --arg next_steps "Clean up stale ReplicaSet \`$RS\` for Deployment \`$DEPLOYMENT_NAME\` in namespace \`${NAMESPACE}\` " \
            --arg details "$RS_DETAILS_JSON" \
            '{severity: "2", title: $title, next_steps: $next_steps, details: $details}')
    fi
}

# Get Deployment JSON.
# KUBERNETES_DISTRIBUTION_BINARY is expanded unquoted, exactly as before.
DEPLOYMENT_JSON=$(${KUBERNETES_DISTRIBUTION_BINARY} get deployment "$DEPLOYMENT_NAME" -n "$NAMESPACE" --context "$CONTEXT" -o json)

# Get the ReplicaSets in the namespace that are owned by this Deployment
REPLICASETS_JSON=$(${KUBERNETES_DISTRIBUTION_BINARY} get rs -n "$NAMESPACE" --context "$CONTEXT" -o json | jq --arg DEPLOYMENT_NAME "$DEPLOYMENT_NAME" \
    '[.items[] | select(.metadata.ownerReferences[]? | select(.kind == "Deployment" and .name == $DEPLOYMENT_NAME))]')

# Extract the name of the latest ReplicaSet (newest creationTimestamp) from the filtered JSON
LATEST_RS=$(jq -r 'sort_by(.metadata.creationTimestamp) | last(.[]).metadata.name' <<<"$REPLICASETS_JSON")

# Extract names of all ReplicaSets associated with the Deployment from the filtered JSON:
# ALL_RS is the space-separated form used for display, ALL_RS_NAMES the array used for iteration.
ALL_RS=$(jq -r '.[].metadata.name' <<<"$REPLICASETS_JSON" | tr '\n' ' ')
readarray -t ALL_RS_NAMES < <(jq -r '.[].metadata.name' <<<"$REPLICASETS_JSON")

echo "Latest ReplicaSet: $LATEST_RS"
echo "All ReplicaSets for the deployment: $ALL_RS"

ROLLING_UPDATE_STATUS=-1 # Default to -1; will be set to 0 or 1 by check_rolling_update_status
check_rolling_update_status

# Check if there are multiple ReplicaSets and if the latest is active
if (( ${#ALL_RS_NAMES[@]} > 1 )); then
    echo "Multiple ReplicaSets detected. Verifying..."

    # Initialize issues as an empty array if not already set
    issues=${issues:-"[]"}

    # Loop through all ReplicaSets
    for RS in "${ALL_RS_NAMES[@]}"; do
        # Skip the latest ReplicaSet
        if [[ "$RS" == "$LATEST_RS" ]]; then
            continue
        fi

        # Start every ReplicaSet with no pending issue, so an issue raised for an
        # earlier ReplicaSet is not appended a second time.
        issue_details=""

        # Check the status of older ReplicaSets (replicas, availableReplicas, readyReplicas)
        RS_DETAILS_JSON=$(jq --arg RS "$RS" '.[] | select(.metadata.name==$RS)' <<<"$REPLICASETS_JSON")
        REPLICAS=$(jq '.status.replicas' <<<"$RS_DETAILS_JSON")
        if [[ "$REPLICAS" == "0" ]]; then
            echo "ReplicaSet $RS for Deployment $DEPLOYMENT_NAME is not active. Consider for cleanup..."
        else
            if [[ $ROLLING_UPDATE_STATUS -eq 0 ]]; then
                date
                echo "Multiple ReplicaSets are active, which is expected due to the rolling update process."
                # Issue objects are built with jq so the ReplicaSet JSON in "details" is escaped correctly
                issue_details=$(jq -cn \
                    --arg title "A rolling update is in progress for Deployment \`$DEPLOYMENT_NAME\` in namespace \`${NAMESPACE}\`" \
                    --arg next_steps "Wait for Rollout to Complete and Check Again." \
                    --arg details "$RS_DETAILS_JSON" \
                    '{severity: "4", title: $title, next_steps: $next_steps, details: $details}')

            elif [[ $ROLLING_UPDATE_STATUS -eq 1 ]]; then
                echo "Multiple ReplicaSets are active and no update appears to be in place. Investigation may be required to ensure they are not conflicting."
                verify_pods_association_with_latest_rs
                # NOTE(review): this replaces any issue_details set by the call above, as before.
                issue_details=$(jq -cn \
                    --arg title "Conflicting versions detected for Deployment \`$DEPLOYMENT_NAME\` in namespace \`${NAMESPACE}\`" \
                    --arg next_steps "Clean up stale ReplicaSet \`$RS\` for Deployment \`$DEPLOYMENT_NAME\` in namespace \`${NAMESPACE}\` " \
                    --arg details "$RS_DETAILS_JSON" \
                    '{severity: "2", title: $title, next_steps: $next_steps, details: $details}')
            else
                echo "Multiple ReplicaSets are active and no update appears to be in place. Investigation may be required to ensure they are not conflicting."
            fi
        fi

        # Concatenate issue detail to the string
        if [ -n "$issue_details" ]; then
            # Remove the closing bracket from issues to prepare for adding a new item
            issues="${issues%]}"

            # If issues is not an empty array (more than just "["), add a comma before the new item
            if [ "$issues" != "[" ]; then
                issues="$issues,"
            fi

            # Add the new issue detail and close the array
            issues="$issues $issue_details]"
        fi
    done
else
    echo "Only one ReplicaSet is active. Deployment is up to date."
fi

# Display all unique recommendations that can be shown as Next Steps
if [ -n "$issues" ]; then
    echo -e "\nRecommended Next Steps: \n"
    echo "$issues"
fi
Helpful Links