In big data scenarios, Spark jobs run on ECS instances of the ecs.d1ne.6xlarge type. Each instance comes with twelve 5 TB HDD data disks, and all twelve disks must be partitioned, formatted, and mounted by hand. With a large number of nodes, this manual work quickly becomes tedious and time-consuming. This topic describes how to simplify formatting and mounting of the data disks by using LVM volumes.

Prerequisites

Deploy the LVM plugin

Use the following templates to deploy the CSI driver registration, the node plugin DaemonSet, the provisioner Deployment, and the node-storage-manager DaemonSet:

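# Register the CSI driver. attachRequired is false because LVM volumes are
# node-local and need no attach/detach phase.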
apiVersion: storage.k8s.io/v1beta1
kind: CSIDriver
metadata:
  name: localplugin.csi.alibabacloud.com
spec:
  attachRequired: false
  podInfoOnMount: true
---
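# DaemonSet that runs the CSI node plugin (with its driver-registrar sidecar)
# on every node in the cluster.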
kind: DaemonSet
apiVersion: apps/v1
metadata:
  name: csi-local-plugin
  namespace: kube-system
spec:
  selector:
    matchLabels:
      app: csi-local-plugin
  template:
    metadata:
      labels:
        app: csi-local-plugin
    spec:
      tolerations:
        - operator: Exists
      serviceAccount: admin
      priorityClassName: system-node-critical
      hostNetwork: true
      hostPID: true
      containers:
        - name: driver-registrar
          image: registry.cn-hangzhou.aliyuncs.com/acs/csi-node-driver-registrar:v1.1.0
          imagePullPolicy: Always
          args:
            - "--v=5"
            - "--csi-address=/csi/csi.sock"
            - "--kubelet-registration-path=/var/lib/kubelet/csi-plugins/localplugin.csi.alibabacloud.com/csi.sock"
          env:
            - name: KUBE_NODE_NAME
              valueFrom:
                fieldRef:
                  apiVersion: v1
                  fieldPath: spec.nodeName
          volumeMounts:
            - name: plugin-dir
              mountPath: /csi
            - name: registration-dir
              mountPath: /registration

        - name: csi-localplugin
          securityContext:
            privileged: true
            capabilities:
              add: ["SYS_ADMIN"]
            allowPrivilegeEscalation: true
          image: registry.cn-hangzhou.aliyuncs.com/acs/csi-plugin:v1.14.8.41-bce68b74-aliyun
          imagePullPolicy: "Always"
          args:
            - "--endpoint=$(CSI_ENDPOINT)"
            - "--v=5"
            - "--nodeid=$(KUBE_NODE_NAME)"
            - "--driver=localplugin.csi.alibabacloud.com"
          env:
            - name: KUBE_NODE_NAME
              valueFrom:
                fieldRef:
                  apiVersion: v1
                  fieldPath: spec.nodeName
            - name: DRIVER_VENDOR
              value: localplugin.csi.alibabacloud.com
            - name: CSI_ENDPOINT
              value: unix://var/lib/kubelet/csi-plugins/localplugin.csi.alibabacloud.com/csi.sock
          volumeMounts:
            - name: pods-mount-dir
              mountPath: /var/lib/kubelet
              mountPropagation: "Bidirectional"
            - mountPath: /dev
              mountPropagation: "HostToContainer"
              name: host-dev
            - mountPath: /var/log/
              name: host-log
      volumes:
        - name: plugin-dir
          hostPath:
            path: /var/lib/kubelet/csi-plugins/localplugin.csi.alibabacloud.com
            type: DirectoryOrCreate
        - name: registration-dir
          hostPath:
            path: /var/lib/kubelet/plugins_registry
            type: DirectoryOrCreate
        - name: pods-mount-dir
          hostPath:
            path: /var/lib/kubelet
            type: Directory
        - name: host-dev
          hostPath:
            path: /dev
        - name: host-log
          hostPath:
            path: /var/log/
  updateStrategy:
    rollingUpdate:
      maxUnavailable: 10%
    type: RollingUpdate
---
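# Deployment that runs the CSI external-provisioner and external-resizer
# sidecars for the LVM driver; it prefers to be scheduled on master nodes.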
kind: Deployment
apiVersion: apps/v1
metadata:
  name: csi-local-provisioner
  namespace: kube-system
spec:
  selector:
    matchLabels:
      app: csi-local-provisioner
  replicas: 2
  template:
    metadata:
      labels:
        app: csi-local-provisioner
    spec:
      tolerations:
      - operator: "Exists"
      affinity:
        nodeAffinity:
          preferredDuringSchedulingIgnoredDuringExecution:
          - weight: 1
            preference:
              matchExpressions:
              - key: node-role.kubernetes.io/master
                operator: Exists
      priorityClassName: system-node-critical
      serviceAccount: admin
      hostNetwork: true
      containers:
        - name: external-local-provisioner
          image: registry.cn-hangzhou.aliyuncs.com/acs/csi-provisioner:v1.6.0-b6f763a43-ack
          args:
            - "--csi-address=$(ADDRESS)"
            - "--feature-gates=Topology=True"
            - "--volume-name-prefix=lvm"
            - "--strict-topology=true"
            - "--timeout=150s"
            - "--extra-create-metadata=true"
            - "--enable-leader-election=true"
            - "--leader-election-type=leases"
            - "--retry-interval-start=500ms"
            - "--v=5"
          env:
            - name: ADDRESS
              value: /socketDir/csi.sock
          imagePullPolicy: "Always"
          volumeMounts:
            - name: socket-dir
              mountPath: /socketDir
        - name: external-local-resizer
          image: registry.cn-hangzhou.aliyuncs.com/acs/csi-resizer:v0.3.0
          args:
            - "--v=5"
            - "--csi-address=$(ADDRESS)"
            - "--leader-election"
          env:
            - name: ADDRESS
              value: /socketDir/csi.sock
          imagePullPolicy: "Always"
          volumeMounts:
            - name: socket-dir
              mountPath: /socketDir/
      volumes:
        - name: socket-dir
          hostPath:
            path: /var/lib/kubelet/csi-plugins/localplugin.csi.alibabacloud.com
            type: DirectoryOrCreate
---
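# DaemonSet that prepares LVM storage on each node; see the cm-node-storage
# ConfigMap below for its node whitelist/blacklist configuration.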
apiVersion: apps/v1
kind: DaemonSet
metadata:
  name: node-storage-manager
  namespace: kube-system
spec:
  selector:
    matchLabels:
      app: node-storage-manager
  template:
    metadata:
      labels:
        app: node-storage-manager
    spec:
      containers:
      - args:
        - --nodeid=$(KUBE_NODE_NAME)
        env:
        - name: KUBE_NODE_NAME
          valueFrom:
            fieldRef:
              apiVersion: v1
              fieldPath: spec.nodeName
        image: registry.cn-hangzhou.aliyuncs.com/plugins/node-storage-manager:v1.14.8-bac4c12
        imagePullPolicy: Always
        name: node-storage-manager
        securityContext:
          allowPrivilegeEscalation: true
          capabilities:
            add:
            - SYS_ADMIN
          privileged: true
        volumeMounts:
        - mountPath: /dev
          mountPropagation: HostToContainer
          name: host-dev
        - mountPath: /var/log/
          name: host-log
        - mountPath: /host/etc
          name: etc
      hostNetwork: true
      hostPID: true
      priorityClassName: system-node-critical
      restartPolicy: Always
      serviceAccount: admin
      serviceAccountName: admin
      tolerations:
      - operator: Exists
      volumes:
      - hostPath:
          path: /dev
          type: ""
        name: host-dev
      - hostPath:
          path: /var/log/
          type: ""
        name: host-log
      - hostPath:
          path: /etc
          type: ""
        name: etc
  updateStrategy:
    rollingUpdate:
      maxUnavailable: 10%
    type: RollingUpdate

Configure the node whitelist, blacklist, and pvConfig as needed. By applying the following ConfigMap, you can use LVM on the target nodes.
apiVersion: v1
kind: ConfigMap
metadata:
  name: cm-node-storage
  namespace: kube-system
data:
  volumegroup.json: |-
    {
      "volumegroup1": {
        "nodeList": {
            "whiteList": {
                "nodeName": ["all-cluster-nodes"],
                "nodeLabel": ["diskType=ssd", "diskType=hdd"]
            },
            "blackList": {
                "nodeName": [],
                "nodeLabel": []
            }
        },
        "pvConfig": {
            "globalConfig": ["aliyun-local-disk"],
            "specialConfig": {
                "nodeName": {},
                "nodeLabel": {}
            }
        },
        "status": "In_Use"
      }
    }
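The whiteList and blackList fields accept both node names and node labels. As a hedged variant (the node names below are illustrative), volumegroup1 can be restricted to specific nodes instead of the whole cluster:
apiVersion: v1
kind: ConfigMap
metadata:
  name: cm-node-storage
  namespace: kube-system
data:
  volumegroup.json: |-
    {
      "volumegroup1": {
        "nodeList": {
            "whiteList": {
                "nodeName": ["cn-beijing.192.168.1.10", "cn-beijing.192.168.1.11"],
                "nodeLabel": []
            },
            "blackList": {
                "nodeName": [],
                "nodeLabel": []
            }
        },
        "pvConfig": {
            "globalConfig": ["aliyun-local-disk"],
            "specialConfig": {
                "nodeName": {},
                "nodeLabel": {}
            }
        },
        "status": "In_Use"
      }
    }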

Configure a PVC

  1. Use the following template to configure a StorageClass.
    allowVolumeExpansion: true
    apiVersion: storage.k8s.io/v1
    kind: StorageClass
    metadata:
      name: csi-local-imm
    parameters:
      fsType: ext4
      lvmType: striping
      vgName: volumegroup1
      volumeType: LVM
    provisioner: localplugin.csi.alibabacloud.com
    reclaimPolicy: Delete
    volumeBindingMode: Immediate
  2. Use the following template to configure a PVC. (A minimal Pod sketch for verifying the claim follows this list.)
    apiVersion: v1
    kind: PersistentVolumeClaim
    metadata:
      name: lvm-pvc-im
    spec:
      accessModes:
      - ReadWriteOnce
      resources:
        requests:
          storage: 12288Gi
      storageClassName: csi-local-imm
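
After the PVC is created, a quick way to check that LVM provisioning works end to end is to mount the claim from a throwaway Pod. The sketch below makes some assumptions: the Pod name, the nginx image, and the /data mount path are all illustrative. If the Pod reaches Running, the LVM volume was created and mounted on a whitelisted node.
apiVersion: v1
kind: Pod
metadata:
  name: lvm-pvc-test        # illustrative name
spec:
  containers:
  - name: app
    image: nginx            # any image works; nginx is an assumption
    volumeMounts:
    - name: data
      mountPath: /data      # illustrative mount path
  volumes:
  - name: data
    persistentVolumeClaim:
      claimName: lvm-pvc-im # the PVC created in the previous step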

Configure Alluxio

Use the following template to configure Alluxio.
# The Alluxio Open Foundation licenses this work under the Apache License, version 2.0
# (the "License"). You may not use this work except in compliance with the License, which is
# available at www.apache.org/licenses/LICENSE-2.0
#
# This software is distributed on an "AS IS" basis, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND,
# either express or implied, as more fully set forth in the License.
#
# See the NOTICE file distributed with this work for information regarding copyright ownership.
#

# This should not be modified in the usual case.
fullnameOverride: alluxio


## Common ##

# Docker Image
image: registry-vpc.cn-beijing.aliyuncs.com/alluxio/alluxio
imageTag: 2.3.0
imagePullPolicy: IfNotPresent

# Security Context
user: 0
group: 0
fsGroup: 0

# Site properties for all the components
properties:
  fs.oss.accessKeyId: YOUR-ACCESS-KEY-ID
  fs.oss.accessKeySecret: YOUR-ACCESS-KEY-SECRET
  fs.oss.endpoint: oss-cn-beijing-internal.aliyuncs.com
  alluxio.master.mount.table.root.ufs: oss://cloudnativeai/
  alluxio.master.persistence.blacklist: .staging,_temporary
  alluxio.security.stale.channel.purge.interval: 365d
  alluxio.user.metrics.collection.enabled: 'true'
  alluxio.user.short.circuit.enabled: 'true'
  alluxio.user.file.write.tier.default: 1
  alluxio.user.block.size.bytes.default: 64MB #default 64MB
  alluxio.user.file.writetype.default: CACHE_THROUGH
  alluxio.user.file.metadata.load.type: ONCE
  alluxio.user.file.readtype.default: CACHE
  #alluxio.worker.allocator.class: alluxio.worker.block.allocator.MaxFreeAllocator
  alluxio.worker.allocator.class: alluxio.worker.block.allocator.RoundRobinAllocator
  alluxio.worker.file.buffer.size: 128MB
  alluxio.worker.evictor.class: alluxio.worker.block.evictor.LRUEvictor
  alluxio.job.master.client.threads: 5000
  alluxio.job.worker.threadpool.size: 300

# Recommended JVM Heap options for running in Docker
# Ref: https://developers.redhat.com/blog/2017/03/14/java-inside-docker/
# These JVM options are common to all Alluxio services
# jvmOptions:
#   - "-XX:+UnlockExperimentalVMOptions"
#   - "-XX:+UseCGroupMemoryLimitForHeap"
#   - "-XX:MaxRAMFraction=2"

# Mount Persistent Volumes to all components
# mounts:
# - name: <persistentVolume claimName>
#   path: <mountPath>

# Use labels to run Alluxio on a subset of the K8s nodes
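# (Assumption) The nodeSelector entries below (master, worker, fuse) expect
# the label alluxio=true, so label the target nodes accordingly before
# installing this chart.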

## Master ##

master:
  count: 1 # Controls the number of StatefulSets. For multiMaster mode increase this to >1.
  replicas: 1 # Controls #replicas in a StatefulSet and should not be modified in the usual case.
  enableLivenessProbe: false
  enableReadinessProbe: false
  args: # Arguments to Docker entrypoint
    - master-only
    - --no-format
  # Properties for the master component
  properties:
  # Example: use ROCKS DB instead of Heap
  # alluxio.master.metastore: ROCKS
  # alluxio.master.metastore.dir: /metastore
  resources:
    # The default xmx is 8G
    limits:
      cpu: "4"
      memory: "8G"
    requests:
      cpu: "1"
      memory: "1G"
  ports:
    embedded: 19200
    rpc: 19998
    web: 19999
  hostPID: true
  hostNetwork: true
  # dnsPolicy will be ClusterFirstWithHostNet if hostNetwork: true
  # and ClusterFirst if hostNetwork: false
  # You can specify dnsPolicy here to override this inference
  # dnsPolicy: ClusterFirst
  # JVM options specific to the master container
  jvmOptions:
  nodeSelector:
    alluxio: 'true'

jobMaster:
  args:
    - job-master
  # Properties for the jobMaster component
  enableLivenessProbe: false
  enableReadinessProbe: false
  properties:
  resources:
    limits:
      cpu: "4"
      memory: "8G"
    requests:
      cpu: "1"
      memory: "1G"
  ports:
    embedded: 20003
    rpc: 20001
    web: 20002
  # JVM options specific to the jobMaster container
  jvmOptions:

# Alluxio supports journal type of UFS and EMBEDDED
# UFS journal with HDFS example
# journal:
#   type: "UFS"
#   folder: "hdfs://{$hostname}:{$hostport}/journal"
# EMBEDDED journal to /journal example
# journal:
#   type: "EMBEDDED"
#   folder: "/journal"
journal:
  type: "UFS" # "UFS" or "EMBEDDED"
  ufsType: "local" # Ignored if type is "EMBEDDED". "local" or "HDFS"
  folder: "/journal" # Master journal folder
  # volumeType controls the type of journal volume.
  # It can be "persistentVolumeClaim" or "emptyDir"
  volumeType: emptyDir
  size: 1Gi
  # Attributes to use when the journal is persistentVolumeClaim
  storageClass: "standard"
  accessModes:
    - ReadWriteOnce
  # Attributes to use when the journal is emptyDir
  medium: ""
  # Configuration for journal formatting job
  format:
    runFormat: false # Change to true to format journal
    job:
      activeDeadlineSeconds: 30
      ttlSecondsAfterFinished: 10
    resources:
      limits:
        cpu: "4"
        memory: "8G"
      requests:
        cpu: "1"
        memory: "1G"


# You can enable metastore to use ROCKS DB instead of Heap
# metastore:
#   volumeType: persistentVolumeClaim # Options: "persistentVolumeClaim" or "emptyDir"
#   size: 1Gi
#   mountPath: /metastore
# # Attributes to use when the metastore is persistentVolumeClaim
#   storageClass: "standard"
#   accessModes:
#    - ReadWriteOnce
# # Attributes to use when the metastore is emptyDir
#   medium: ""


## Worker ##

worker:
  args:
    - worker-only
    - --no-format
  enableLivenessProbe: false
  enableReadinessProbe: false
  # Properties for the worker component
  properties:
  resources:
    limits:
      cpu: "4"
      memory: "4G"
    requests:
      cpu: "1"
      memory: "2G"
  ports:
    rpc: 29999
    web: 30000
  hostPID: true
  hostNetwork: true
  # dnsPolicy will be ClusterFirstWithHostNet if hostNetwork: true
  # and ClusterFirst if hostNetwork: false
  # You can specify dnsPolicy here to override this inference
  # dnsPolicy: ClusterFirst
  # JVM options specific to the worker container
  jvmOptions:
  nodeSelector:
    alluxio: 'true'

jobWorker:
  args:
    - job-worker
  enableLivenessProbe: false
  enableReadinessProbe: false
  # Properties for the jobWorker component
  properties:
  resources:
    limits:
      cpu: "4"
      memory: "4G"
    requests:
      cpu: "1"
      memory: "1G"
  ports:
    rpc: 30001
    data: 30002
    web: 30003
  # JVM options specific to the jobWorker container
  jvmOptions:

# Tiered Storage
# emptyDir example
#  - level: 0
#    alias: MEM
#    mediumtype: MEM
#    path: /dev/shm
#    type: emptyDir
#    quota: 1G
#
# hostPath example
#  - level: 0
#    alias: MEM
#    mediumtype: MEM
#    path: /dev/shm
#    type: hostPath
#    quota: 1G
#
# persistentVolumeClaim example
#  - level: 1
#    alias: SSD
#    mediumtype: SSD
#    type: persistentVolumeClaim
#    name: alluxio-ssd
#    path: /dev/ssd
#    quota: 10G
#
# multi-part mediumtype example
#  - level: 1
#    alias: SSD,HDD
#    mediumtype: SSD,HDD
#    type: persistentVolumeClaim
#    name: alluxio-ssd,alluxio-hdd
#    path: /dev/ssd,/dev/hdd
#    quota: 10G,10G
tieredstore:
  levels:
    - level: 0
      alias: HDD
      mediumtype: HDD-0
      path: /mnt/disk1
      type: persistentVolumeClaim
      name: lvm-pvc-im
      quota: 12000G
      high: 0.95
      low: 0.7
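      # (Assumption) quota (12000G) is kept below the 12288Gi PVC request so
      # that the high watermark (0.95) can trigger eviction before the
      # underlying LVM volume itself fills up.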

# Short circuit related properties
shortCircuit:
  enabled: true
  # The policy for short circuit can be "local" or "uuid",
  # local means the cache directory is in the same mount namespace,
  # uuid means interact with domain socket
  policy: uuid
  # volumeType controls the type of shortCircuit volume.
  # It can be "persistentVolumeClaim" or "hostPath"
  volumeType: hostPath
  size: 1Mi
  # Attributes to use if the domain socket volume is PVC
  pvcName: alluxio-worker-domain-socket
  accessModes:
    - ReadWriteOnce
  storageClass: standard
  # Attributes to use if the domain socket volume is hostPath
  hostPath: "/tmp/alluxio-domain" # The hostPath directory to use


## FUSE ##

fuse:
  image: registry-vpc.cn-beijing.aliyuncs.com/alluxio/alluxio-fuse
  imageTag: 2.3.0
  imagePullPolicy: IfNotPresent
  # Change both to true to deploy FUSE
  enabled: false
  clientEnabled: false
  # Properties for the jobWorker component
  properties:
  # Customize the MaxDirectMemorySize
  # These options are specific to the FUSE daemon
  jvmOptions:
    - "-XX:MaxDirectMemorySize=2g"
  hostNetwork: true
  hostPID: true
  dnsPolicy: ClusterFirstWithHostNet
  user: 0
  group: 0
  fsGroup: 0
  args:
    - fuse
    - --fuse-opts=allow_other
  # Mount path in the host
  mountPath: /mnt/alluxio-fuse
  resources:
    requests:
      cpu: "0.5"
      memory: "1G"
    limits:
      cpu: "4"
      memory: "4G"
  nodeSelector:
    alluxio: 'true'


##  Secrets ##

# Format: (<name>:<mount path under /secrets/>):
# secrets:
#   master: # Shared by master and jobMaster containers
#     alluxio-hdfs-config: hdfsConfig
#   worker: # Shared by worker and jobWorker containers
#     alluxio-hdfs-config: hdfsConfig
Note
  • The tieredstore must use the PVC configured above:
    tieredstore:
      levels:
        - level: 0
          alias: HDD
          mediumtype: HDD-0
          path: /mnt/disk1
          type: persistentVolumeClaim
          name: lvm-pvc-im
          quota: 12000G
          high: 0.95
          low: 0.7
  • Alluxio must be deployed in the same namespace as the PVC.
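
For example (a hedged sketch; the "alluxio" namespace name is illustrative): if Alluxio is installed into a non-default namespace, create the PVC in that namespace first.
apiVersion: v1
kind: PersistentVolumeClaim
metadata:
  name: lvm-pvc-im
  namespace: alluxio       # must match the namespace Alluxio is deployed in
spec:
  accessModes:
  - ReadWriteOnce
  resources:
    requests:
      storage: 12288Gi
  storageClassName: csi-local-imm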