Template name
ACS-CS-DedicatedMigration: Dedicated ACK master hibernation and etcd backup upload
Template description
Hibernates the master nodes of a Container Service for Kubernetes (ACK) dedicated cluster, backs up the etcd data, and uploads the backup to an Object Storage Service (OSS) bucket.
Template type
Automated
Owner
Alibaba Cloud
Input parameters
Parameter | Description | Data type | Required | Default value | Limit |
targets | The destination instances. | Json | Yes | ||
BucketName | The name of the OSS bucket to which you want to upload the snapshot. | String | Yes | ||
OSSEndpoint | The endpoint of the OSS bucket to which you want to upload the snapshot. | String | Yes | ||
ClusterID | The cluster ID. | String | Yes | ||
regionId | The region ID. | String | No | {{ ACS::RegionId }} | |
workingDir | The directory in which the command is run in the Elastic Compute Service (ECS) instance. | String | No | /root | |
rateControl | The concurrency settings for task execution. | Json | No | {'Mode': 'Concurrency', 'MaxErrors': 0, 'Concurrency': 5} | |
action | The action to perform. Valid values: migrate and rollback. | String | No | rollback | |
OOSAssumeRole | The RAM role that is assumed by CloudOps Orchestration Service (OOS). | String | No | "" |
Output parameters
Parameter | Description | Data type |
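sleepOrWakeupControlPlaneOutputs | The aggregated command outputs of the sleepOrWakeupControlPlane task. | List |
etcdCheckoutOutputs | The aggregated command outputs of the etcdCheckout task. | List |
findLeaderOutputs | The aggregated command outputs of the findLeader task. | List |
readSignOutputs | The aggregated command outputs of the readSign task. | List |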
Permission policy that is required to execute the template
{
"Version": "1",
"Statement": [
{
"Action": [
"ecs:DescribeInstances",
"ecs:DescribeInvocationResults",
"ecs:DescribeInvocations",
"ecs:RunCommand"
],
"Resource": "*",
"Effect": "Allow"
},
{
"Action": [
"oos:GetApplicationGroup"
],
"Resource": "*",
"Effect": "Allow"
}
]
}
Details
Template content
# Template header: format version plus the localized name, description and
# category of the template.
FormatVersion: OOS-2019-06-01
Description:
  name-en: Hibernate-ACK-Dedicated-Master-And-Upload-etcd-Backup
  name-zh-cn: ACK专属版master休眠&etcd备份上传
  en: Hibernates the master nodes of an ACK dedicated cluster and uploads an etcd backup.
  zh-cn: 为ACK专有版集群休眠master节点并上传etcd备份
  categories:
  - others
# Input parameters of the template. Each entry lists its type first, then the
# localized label, then any console association and default value.
Parameters:
  # Region passed to every task and substituted into the tool download URLs.
  regionId:
    Type: String
    Label:
      en: Region ID
      zh-cn: 地域ID
    AssociationProperty: RegionId
    Default: '{{ ACS::RegionId }}'
  # Directory on the ECS instance in which each command is run.
  workingDir:
    Type: String
    Label:
      en: Working Directory
      zh-cn: ECS实例中运行命令的目录
    Default: /root
  # Loop rate control shared by all RunCommand tasks.
  rateControl:
    Type: Json
    Label:
      en: Rate Control
      zh-cn: 任务执行的并发比率
    AssociationProperty: RateControl
    Default:
      Mode: Concurrency
      MaxErrors: 0
      Concurrency: 5
  # ECS instances the commands are run on; used as the getInstance filter.
  targets:
    Type: Json
    Label:
      en: Target Instance
      zh-cn: 目标实例
    AssociationProperty: Targets
    AssociationPropertyMetadata:
      ResourceType: 'ALIYUN::ECS::Instance'
      RegionId: regionId
  # migrate: hibernate the control plane and back up etcd.
  # rollback: move the backed-up manifests back into place.
  action:
    Type: String
    Label:
      en: Action
      zh-cn: 配置方式
    Default: rollback
    AllowedValues:
    - migrate
    - rollback
  # RAM role assumed by OOS; referenced by the top-level RamRole key.
  OOSAssumeRole:
    Type: String
    Label:
      en: OOS Assume Role
      zh-cn: OOS扮演的RAM角色
    Default: ''
  # OSS bucket that receives the etcd snapshot.
  BucketName:
    Type: String
    Label:
      en: Bucket Name
      zh-cn: 需要上传snapshot的oss路径
  # OSS endpoint passed to ossutil for the upload and URL signing.
  OSSEndpoint:
    Type: String
    Label:
      en: OSS Endpoint
      zh-cn: 需要上传snapshot的oss对应的endpoint
  # Cluster ID; used in the snapshot file name and as the new key prefix.
  ClusterID:
    Type: String
    Label:
      en: Cluster ID
      zh-cn: 集群的ID
RamRole: '{{ OOSAssumeRole }}'
Tasks:
# Resolves the `targets` parameter into a list of ECS instance IDs, which the
# later tasks loop over.
- Name: getInstance
  Action: 'ACS::SelectTargets'
  Description:
    en: Retrieves the ECS instances.
    zh-cn: 获取ECS实例
  Properties:
    ResourceType: 'ALIYUN::ECS::Instance'
    RegionId: '{{ regionId }}'
    Filters:
    - '{{ targets }}'
  Outputs:
    instanceIds:
      Type: List
      ValueSelector: 'Instances.Instance[].InstanceId'
# Hibernates (action=migrate) or wakes up (action=rollback) the control plane
# on every selected instance by moving the static manifests between
# /etc/kubernetes/manifests and /etc/kubernetes/manifests.backup.
# Any failure jumps to the `rollback` task.
- Name: sleepOrWakeupControlPlane
  Action: ACS::ECS::RunCommand
  OnError: rollback
  Description:
    en: Hibernates or starts the control plane components on the master nodes.
    zh-cn: 休眠或启动Master节点管控组件
  Properties:
    regionId: '{{ regionId }}'
    commandContent: |-
      #!/bin/bash
      set -e
      if [ "{{action}}" = "migrate" ]; then
        # Move the manifests away, then wait for kube-apiserver to disappear.
        mkdir -p /etc/kubernetes/manifests.backup
        if_move=$(ls /etc/kubernetes/manifests/ | wc -l)
        if [ "$if_move" != "0" ]; then
          mv -f /etc/kubernetes/manifests/* /etc/kubernetes/manifests.backup/
        fi
        is_ok=0
        set +e
        # Pick the container listing command that matches the kubelet runtime.
        ps -o cmd -p `pidof kubelet` | grep 'container-runtime-endpoint=/var/run/containerd/containerd.sock'
        if [ $? -ne 0 ]; then
          echo "The container runtime is not containerd."
          list_containers="docker ps"
        else
          echo "The container runtime is containerd."
          list_containers="crictl --runtime-endpoint /var/run/containerd/containerd.sock ps"
        fi
        # Poll up to 150 times, 2 seconds apart (about 300 seconds in total).
        # $list_containers is intentionally unquoted so it splits into words.
        for ((integer = 0; integer < 150; integer++)); do
          count=$($list_containers | grep kube-apiserver | wc -l)
          if [ "$count" = "0" ]; then
            is_ok=1
            break
          else
            sleep 2
          fi
        done
        set -e
        if [ "$is_ok" == "0" ]; then
          # kube-apiserver never stopped: restore the manifests and fail.
          mv -f /etc/kubernetes/manifests.backup/* /etc/kubernetes/manifests/
          echo "Rollback finished."
          exit 1
        else
          echo "The control plane is sleeping now."
        fi
      elif [ "{{action}}" = "rollback" ]; then
        mkdir -p /etc/kubernetes/manifests.backup
        if_move=$(ls /etc/kubernetes/manifests.backup/ | wc -l)
        if [ "$if_move" != "0" ]; then
          mv -f /etc/kubernetes/manifests.backup/* /etc/kubernetes/manifests/
        fi
        echo "The control plane has woken up."
      else
        echo "The action must be migrate or rollback."
        exit 1
      fi
    instanceId: '{{ ACS::TaskLoopItem }}'
    commandType: RunShellScript
    workingDir: '{{ workingDir }}'
    # Must exceed the ~300 second polling window above. With the previous
    # value of 240 the command was killed before the script could restore the
    # manifests and report the failure itself.
    timeout: 360
  Loop:
    Items: '{{ getInstance.instanceIds }}'
    RateControl: '{{ rateControl }}'
    Outputs:
      commandOutputs:
        AggregateType: Fn::ListJoin
        AggregateField: commandOutput
  Outputs:
    commandOutput:
      ValueSelector: invocationOutput
      Type: String
# On the etcd leader only (action=migrate): saves an etcd snapshot, rewrites
# its key prefix to /<ClusterID>, uploads it to OSS, and writes
# /tmp/etcdsnap/sign containing the signed URL and the base64-encoded
# service-account and front-proxy CA key material. Non-leader nodes and
# action=rollback exit 0 without doing anything.
- Name: etcdCheckout
  Action: ACS::ECS::RunCommand
  OnError: rollback
  Description:
    en: Executes a Cloud Assistant command.
    zh-cn: 执行云助手命令
  Properties:
    regionId: '{{ regionId }}'
    commandContent: |-
      #!/bin/bash
      set -e
      if [ "{{action}}" = "rollback" ]; then
        exit 0
      fi

      # download_tool DEST URL
      # Fetches an executable to DEST unless DEST already exists. The file is
      # downloaded under a temporary name and moved into place only on
      # success, so a failed download cannot leave a partial file that later
      # runs would mistake for the tool.
      download_tool() {
        local dest="$1" url="$2"
        if [ -f "$dest" ]; then
          return 0
        fi
        if ! wget -c -t 10 -O "$dest.download" "$url"; then
          rm -f "$dest.download"
          return 1
        fi
        chmod +x "$dest.download" && mv -f "$dest.download" "$dest"
      }

      # Get the IP address of eth0.
      IP=$(/sbin/ifconfig eth0 | grep inet | grep -v 127.0.0.1 | grep -v inet6 | awk '{print $2}' | tr -d "addr:")
      ENDPOINT="https://$IP:2379"
      echo "ENDPOINT: "$ENDPOINT
      # Query the status of etcd endpoints to determine if the current node is the etcd leader.
      if ! ETCDCTL_API=3 /usr/bin/etcdctl --cacert=/var/lib/etcd/cert/ca.pem --cert=/var/lib/etcd/cert/etcd-server.pem --key=/var/lib/etcd/cert/etcd-server-key.pem --endpoints=$ENDPOINT endpoint status | grep true; then
        echo "This node is not the etcd leader. Exiting."
        exit 0
      fi
      yum install curl wget jq -y
      # Download ossutil and save it to the /tmp/ directory.
      if ! download_tool /tmp/ossutil64 "https://oos-public-{{regionId}}.oss-{{regionId}}-internal.aliyuncs.com/x64/ossutil64"; then
        echo "Failed to download the ossutil tool. Exiting."
        exit 1
      fi
      if [ ! -f "/tmp/modify-prefix-v2" ]; then
        echo "Downloading modify-prefix-v2..."
      fi
      if ! download_tool /tmp/modify-prefix-v2 "https://aliacs-k8s-{{regionId}}.oss-{{regionId}}-internal.aliyuncs.com/public/pkg/etcd/modify-prefix-v2"; then
        echo "An error occurred while downloading the prefix modification tool. Exiting."
        exit 1
      fi
      # Quoted so that an empty value fails the check instead of breaking the syntax.
      if ! [[ "{{ClusterID}}" =~ ^c.* ]]; then
        echo "clusterID: {{ClusterID}} is not a valid cluster ID. Exiting."
        exit 1
      fi
      echo "clusterID: {{ClusterID}}"
      # If the node is the leader, create a snapshot and save it to the /tmp/etcdsnap/ directory.
      TIMESTAMP=$(date "+%Y%m%d%H%M%S")
      # The snapshot and the sign file hold cluster secrets: create them
      # readable by the current user only.
      umask 077
      mkdir -p /tmp/etcdsnap
      set -x
      SNAP_NAME=etcd_{{ClusterID}}_$TIMESTAMP
      echo "Starting backup. The backup file is /tmp/etcdsnap/"$SNAP_NAME
      DestPrefix="/"{{ClusterID}}
      ETCDCTL_API=3 /usr/bin/etcdctl --cacert=/var/lib/etcd/cert/ca.pem --cert=/var/lib/etcd/cert/etcd-server.pem --key=/var/lib/etcd/cert/etcd-server-key.pem --endpoints=$ENDPOINT snapshot save /tmp/etcdsnap/$SNAP_NAME
      if ! /tmp/modify-prefix-v2 change-prefix --db=/tmp/etcdsnap/$SNAP_NAME --dest-prefix=$DestPrefix; then
        echo "An error occurred while modifying the prefix. Exiting."
        exit 1
      fi
      # Stop command tracing BEFORE any credential is read, otherwise the STS
      # access key, secret and token are printed into the command trace.
      set +x
      # Obtain the OSS-related information and upload the file.
      ROLE=$(curl -s 100.100.100.200/latest/meta-data/ram/security-credentials/)
      ROLERES=$(curl -s 100.100.100.200/latest/meta-data/ram/security-credentials/$ROLE)
      AccessKeyId=$(echo "$ROLERES" | jq -r .AccessKeyId)
      AccessKeySecret=$(echo "$ROLERES" | jq -r .AccessKeySecret)
      SecurityToken=$(echo "$ROLERES" | jq -r .SecurityToken)
      # Put the object to OSS.
      echo "Putting the object to OSS..."
      if ! /tmp/ossutil64 -t "$SecurityToken" -i "$AccessKeyId" -k "$AccessKeySecret" -e {{OSSEndpoint}} cp /tmp/etcdsnap/$SNAP_NAME oss://{{BucketName}}/$SNAP_NAME; then
        echo "Failed to push data to the {{BucketName}} bucket. Exiting."
        exit 1
      fi
      # Sign the URL.
      oss_url=$(/tmp/ossutil64 -t "$SecurityToken" -i "$AccessKeyId" -k "$AccessKeySecret" -e {{OSSEndpoint}} sign --timeout 2400 oss://{{BucketName}}/$SNAP_NAME | grep -v "elapsed" | tr -d '\n')
      sakey=$(cat /etc/kubernetes/pki/sa.key | base64 -w0)
      sapub=$(cat /etc/kubernetes/pki/sa.pub | base64 -w0)
      frontcrt=$(cat /etc/kubernetes/pki/front-proxy-ca.crt | base64 -w0)
      frontkey=$(cat /etc/kubernetes/pki/front-proxy-ca.key | base64 -w0)
      # The existence of this file is what findLeader and readSign check for.
      echo "{\"sakey\":\"$sakey\",\"sapub\":\"$sapub\",\"frontcrt\":\"$frontcrt\",\"frontkey\":\"$frontkey\",\"oss_url\":\"$oss_url\"}" >/tmp/etcdsnap/sign
    instanceId: '{{ ACS::TaskLoopItem }}'
    commandType: RunShellScript
    workingDir: '{{ workingDir }}'
    timeout: 600
  Loop:
    Items: '{{ getInstance.instanceIds }}'
    RateControl: '{{ rateControl }}'
    Outputs:
      commandOutputs:
        AggregateType: Fn::ListJoin
        AggregateField: commandOutput
  Outputs:
    commandOutput:
      ValueSelector: invocationOutput
      Type: String
# Prints the instance ID of every node on which /tmp/etcdsnap/sign exists.
# Nodes without the file, and every node when action=rollback, print nothing.
# readSign intersects these outputs with the instance list.
- Name: findLeader
  Action: ACS::ECS::RunCommand
  OnError: rollback
  Description:
    en: Executes a Cloud Assistant command.
    zh-cn: 执行云助手命令
  Properties:
    regionId: '{{ regionId }}'
    instanceId: '{{ ACS::TaskLoopItem }}'
    commandType: RunShellScript
    workingDir: '{{ workingDir }}'
    timeout: 60
    commandContent: |-
      #!/bin/bash
      if [ "{{action}}" = "rollback" ]; then
        exit 0
      fi
      # No sign file on this node: nothing to report.
      if [ ! -e /tmp/etcdsnap/sign ]; then
        exit 0
      fi
      curl --retry 10 -sSL 100.100.100.200/latest/meta-data/instance-id
  Loop:
    Items: '{{ getInstance.instanceIds }}'
    RateControl: '{{ rateControl }}'
    Outputs:
      commandOutputs:
        AggregateType: Fn::ListJoin
        AggregateField: commandOutput
  Outputs:
    commandOutput:
      Type: String
      ValueSelector: invocationOutput
# Runs only on the instances reported by findLeader: prints the content of
# /tmp/etcdsnap/sign and then deletes the file. On success the execution ends
# here (the `rollback` task is skipped); on error it jumps to `rollback`.
- Name: readSign
  Action: ACS::ECS::RunCommand
  OnError: rollback
  OnSuccess: 'ACS::END'
  Description:
    en: Executes a Cloud Assistant command.
    zh-cn: 执行云助手命令
  Properties:
    regionId: '{{ regionId }}'
    instanceId: '{{ ACS::TaskLoopItem }}'
    commandType: RunShellScript
    workingDir: '{{ workingDir }}'
    timeout: 60
    commandContent: |-
      #!/bin/bash
      if [ "{{action}}" = "rollback" ]; then
        exit 0
      fi
      # Nothing to read when the sign file is absent.
      if [ ! -e /tmp/etcdsnap/sign ]; then
        exit 0
      fi
      cat /tmp/etcdsnap/sign
      rm -rf /tmp/etcdsnap/sign
  Loop:
    Items:
      Fn::Intersection:
      - '{{ getInstance.instanceIds }}'
      - '{{ findLeader.commandOutputs }}'
    RateControl: '{{ rateControl }}'
    Outputs:
      commandOutputs:
        AggregateType: Fn::ListJoin
        AggregateField: commandOutput
  Outputs:
    commandOutput:
      Type: String
      ValueSelector: invocationOutput
# Error handler referenced by `OnError: rollback` in the preceding tasks:
# moves any manifests found in /etc/kubernetes/manifests.backup back into
# /etc/kubernetes/manifests on every selected instance.
- Name: rollback
  Action: 'ACS::ECS::RunCommand'
  Description:
    en: Executes a Cloud Assistant command.
    zh-cn: 执行云助手命令
  Properties:
    regionId: '{{ regionId }}'
    instanceId: '{{ ACS::TaskLoopItem }}'
    commandType: RunShellScript
    workingDir: '{{ workingDir }}'
    timeout: 240
    commandContent: |-
      #!/bin/bash
      set -e
      mkdir -p /etc/kubernetes/manifests.backup
      # Only move when the backup directory actually contains entries.
      backup_count=$(ls /etc/kubernetes/manifests.backup/ | wc -l)
      if [ "$backup_count" != "0" ]; then
        mv -f /etc/kubernetes/manifests.backup/* /etc/kubernetes/manifests/
      fi
      echo "The control plane has woken up."
  Loop:
    Items: '{{ getInstance.instanceIds }}'
    RateControl: '{{ rateControl }}'
    Outputs:
      commandOutputs:
        AggregateType: 'Fn::ListJoin'
        AggregateField: commandOutput
  Outputs:
    commandOutput:
      Type: String
      ValueSelector: invocationOutput
# Template outputs: the aggregated command outputs of each RunCommand task
# other than the `rollback` task.
Outputs:
  sleepOrWakeupControlPlaneOutputs:
    Value: "{{ sleepOrWakeupControlPlane.commandOutputs }}"
    Type: List
  etcdCheckoutOutputs:
    Value: "{{ etcdCheckout.commandOutputs }}"
    Type: List
  findLeaderOutputs:
    Value: "{{ findLeader.commandOutputs }}"
    Type: List
  readSignOutputs:
    Value: "{{ readSign.commandOutputs }}"
    Type: List
# Console layout: groups the input parameters under three labeled sections.
Metadata:
  ALIYUN::OOS::Interface:
    ParameterGroups:
    - Label:
        default:
          en: Configuration Parameters
          zh-cn: 配置参数
      Parameters:
      - ClusterID
      - action
      - BucketName
      - OSSEndpoint
      - workingDir
    - Label:
        default:
          en: Select Instances
          zh-cn: 选择实例
      Parameters:
      - regionId
      - targets
    - Label:
        default:
          en: Advanced Options
          zh-cn: 高级选项
      Parameters:
      - rateControl
      - OOSAssumeRole