|
| 1 | +name: Dataflow Engine Chaos |
| 2 | + |
| 3 | +on: |
| 4 | + schedule: |
| 5 | + - cron: '0 17-23 * * *' # run at minute 0 every hour from 01:00 ~ 07:00 UTC+8 |
| 6 | + |
| 7 | +# See: https://docs.github.com/en/actions/reference/workflow-syntax-for-github-actions#concurrency. |
| 8 | +concurrency: |
| 9 | + group: ${{ github.ref }}-${{ github.workflow }} |
| 10 | + cancel-in-progress: true |
| 11 | + |
| 12 | +# A workflow run is made up of one or more jobs that can run sequentially or in parallel |
| 13 | +jobs: |
| 14 | + # This workflow contains a single job called "base" |
| 15 | + base: |
| 16 | + # The type of runner that the job will run on |
| 17 | + runs-on: ubuntu-18.04 |
| 18 | + timeout-minutes: 30 |
| 19 | + strategy: |
| 20 | + fail-fast: false |
| 21 | + matrix: |
| 22 | + chaos-obj: |
| 23 | + [ |
| 24 | + "pod-failure-dataflow", |
| 25 | + "pod-kill-dataflow", |
| 26 | + ] |
| 27 | + |
| 28 | + # Steps represent a sequence of tasks that will be executed as part of the job |
| 29 | + steps: |
| 30 | + - uses: actions/checkout@v2 |
| 31 | + |
| 32 | + - uses: actions/setup-go@v3 |
| 33 | + with: |
| 34 | + go-version: '1.18'  # quoted so YAML keeps the version a string (an unquoted 1.20 would become 1.2) |
| 35 | + |
| 36 | + - name: Cache go modules |
| 37 | + uses: actions/cache@v2 |
| 38 | + with: |
| 39 | + path: ~/go/pkg/mod |
| 40 | + key: ${{ runner.os }}-dataflow-${{ hashFiles('go.sum') }} |
| 41 | + |
| 42 | + # Set up Kubernetes with K3s |
| 43 | + - name: Set up K3s cluster |
| 44 | + run: | |
| 45 | + curl -sfL https://get.k3s.io | INSTALL_K3S_VERSION=v1.18.9+k3s1 sh -s - \ |
| 46 | + --write-kubeconfig-mode=644 \ |
| 47 | + "${k3s_disable_command:---disable}" metrics-server \ |
| 48 | + "${k3s_disable_command:---disable}" traefik \ |
| 49 | + --flannel-backend=none \ |
| 50 | + --docker |
| 51 | + shell: bash |
| 52 | + |
| 53 | + # This step may fail sometimes, and ideally the workflow would exit immediately when it does, |
| 54 | + # but GitHub Actions doesn't support early exit yet, see https://github.com/actions/runner/issues/662. |
| 55 | + # So we simply wait for a long time. |
| 56 | + - name: Wait for coredns |
| 57 | + run: | |
| 58 | + kubectl rollout status --watch --timeout 600s deployment/coredns -n kube-system |
| 59 | + shell: bash |
| 60 | + env: |
| 61 | + KUBECONFIG: /etc/rancher/k3s/k3s.yaml |
| 62 | + |
| 63 | + - name: Export KUBECONFIG environment variable |
| 64 | + run: | |
| 65 | + echo 'KUBECONFIG=/etc/rancher/k3s/k3s.yaml' >> $GITHUB_ENV |
| 66 | + shell: bash |
| 67 | + |
| 68 | + - name: Print cluster information |
| 69 | + run: | |
| 70 | + kubectl config view |
| 71 | + kubectl cluster-info |
| 72 | + kubectl get nodes |
| 73 | + kubectl get pods -n kube-system |
| 74 | + kubectl get sc |
| 75 | + kubectl version |
| 76 | +
|
| 77 | + - name: Build dataflow engine binary |
| 78 | + run: make df-master df-executor df-chaos-case |
| 79 | + |
| 80 | + - name: Build Dataflow engine docker image |
| 81 | + run: | |
| 82 | + cp -r $GITHUB_WORKSPACE/engine/chaos/manifests/conf/ $GITHUB_WORKSPACE/bin/ |
| 83 | + docker build -f $GITHUB_WORKSPACE/engine/chaos/manifests/Dockerfile -t dataflow:chaos $GITHUB_WORKSPACE/bin |
| 84 | + docker image list |
| 85 | +
|
| 86 | + # Set up metastore and basic services |
| 87 | + - name: Set up metastore and basic services |
| 88 | + run: | |
| 89 | + kubectl apply -f $GITHUB_WORKSPACE/engine/chaos/manifests/metastore.yaml |
| 90 | + kubectl get -f $GITHUB_WORKSPACE/engine/chaos/manifests/metastore.yaml |
| 91 | + kubectl describe -f $GITHUB_WORKSPACE/engine/chaos/manifests/metastore.yaml |
| 92 | + - name: Wait for metastore ready |
| 93 | + run: | |
| 94 | + kubectl wait --for=condition=Ready pod/metastore-framework-mysql-0 --timeout=60s || true |
| 95 | + kubectl wait --for=condition=Ready pod/metastore-user-etcd-0 --timeout=60s || true |
| 96 | +
|
| 97 | + echo show pvc |
| 98 | + kubectl get pvc -l app=metastore -o wide |
| 99 | + echo show pv |
| 100 | + kubectl get pv -o wide |
| 101 | + echo show svc |
| 102 | + kubectl get svc -l app=metastore -o wide |
| 103 | + echo show sts |
| 104 | + kubectl get sts -l app=metastore -o wide |
| 105 | + echo show po |
| 106 | + kubectl get po -l app=metastore -o wide |
| 107 | + echo describe po |
| 108 | + kubectl describe po -l app=metastore |
| 109 | + echo describe pvc |
| 110 | + kubectl describe pvc -l app=metastore |
| 111 | + kubectl wait --for=condition=Ready pod/metastore-framework-mysql-0 --timeout=0s |
| 112 | + kubectl wait --for=condition=Ready pod/metastore-user-etcd-0 --timeout=0s |
| 113 | +
|
| 114 | + - name: Set up server-master |
| 115 | + run: | |
| 116 | + kubectl apply -f $GITHUB_WORKSPACE/engine/chaos/manifests/server-master.yaml |
| 117 | + kubectl get -f $GITHUB_WORKSPACE/engine/chaos/manifests/server-master.yaml |
| 118 | + kubectl describe -f $GITHUB_WORKSPACE/engine/chaos/manifests/server-master.yaml |
| 119 | +
|
| 120 | + - name: Wait for server-master ready |
| 121 | + run: | |
| 122 | + kubectl wait --for=condition=Ready pod -l app=server-master --all --timeout=60s|| true |
| 123 | + echo "<<<<< show pvc >>>>>" |
| 124 | + kubectl get pvc -l app=server-master -o wide |
| 125 | + echo "<<<<< show pv >>>>>" |
| 126 | + kubectl get pv -o wide |
| 127 | + echo "<<<<< show svc >>>>>" |
| 128 | + kubectl get svc -l app=server-master -o wide |
| 129 | + echo "<<<<< show sts >>>>>" |
| 130 | + kubectl get sts -l app=server-master -o wide |
| 131 | + echo "<<<<< show po >>>>>" |
| 132 | + kubectl get po -l app=server-master -o wide |
| 133 | + echo "<<<<< describe po >>>>>" |
| 134 | + kubectl describe po -l app=server-master |
| 135 | + echo "<<<<< describe pvc >>>>>" |
| 136 | + kubectl describe pvc -l app=server-master |
| 137 | + echo "<<<<< show current log for server-master-0 >>>>>" |
| 138 | + kubectl logs server-master-0 || true |
| 139 | + echo "<<<<< show previous log for server-master-0 >>>>>" |
| 140 | + kubectl logs server-master-0 -p || true |
| 141 | + echo "<<<<< show current log for server-master-1 >>>>>" |
| 142 | + kubectl logs server-master-1 || true |
| 143 | + echo "<<<<< show previous log for server-master-1 >>>>>" |
| 144 | + kubectl logs server-master-1 -p || true |
| 145 | + echo "<<<<< show current log for server-master-2 >>>>>" |
| 146 | + kubectl logs server-master-2 || true |
| 147 | + echo "<<<<< show previous log for server-master-2 >>>>>" |
| 148 | + kubectl logs server-master-2 -p || true |
| 149 | +
|
| 150 | + - name: Set up executor |
| 151 | + run: | |
| 152 | + kubectl apply -f $GITHUB_WORKSPACE/engine/chaos/manifests/executor.yaml |
| 153 | + kubectl get -f $GITHUB_WORKSPACE/engine/chaos/manifests/executor.yaml |
| 154 | + kubectl describe -f $GITHUB_WORKSPACE/engine/chaos/manifests/executor.yaml |
| 155 | +
|
| 156 | + - name: Wait for executor ready  # non-fatal wait (|| true), then print executor resource state and pod logs |
| 157 | + run: | |
| 158 | + kubectl wait --for=condition=Ready pod -l app=executor --all --timeout=60s || true |
| 159 | + echo "<<<<< show pvc >>>>>" |
| 160 | + kubectl get pvc -l app=executor -o wide |
| 161 | + echo "<<<<< show pv >>>>>" |
| 162 | + kubectl get pv -o wide |
| 163 | + echo "<<<<< show svc >>>>>" |
| 164 | + kubectl get svc -l app=executor -o wide |
| 165 | + echo "<<<<< show sts >>>>>" |
| 166 | + kubectl get sts -l app=executor -o wide |
| 167 | + echo "<<<<< show po >>>>>" |
| 168 | + kubectl get po -l app=executor -o wide |
| 169 | + echo "<<<<< describe po >>>>>" |
| 170 | + kubectl describe po -l app=executor |
| 171 | + echo "<<<<< describe pvc >>>>>" |
| 172 | + kubectl describe pvc -l app=executor |
| 173 | + echo "<<<<< show current log for executor-0 >>>>>" |
| 174 | + kubectl logs executor-0 || true |
| 175 | + echo "<<<<< show previous log for executor-0 >>>>>" |
| 176 | + kubectl logs executor-0 -p || true |
| 177 | + echo "<<<<< show current log for executor-1 >>>>>" |
| 178 | + kubectl logs executor-1 || true |
| 179 | + echo "<<<<< show previous log for executor-1 >>>>>" |
| 180 | + kubectl logs executor-1 -p || true |
| 181 | + echo "<<<<< show current log for executor-2 >>>>>" |
| 182 | + kubectl logs executor-2 || true |
| 183 | + echo "<<<<< show previous log for executor-2 >>>>>" |
| 184 | + kubectl logs executor-2 -p || true |
| 185 | +
|
| 186 | + - name: Set up chaos test cases |
| 187 | + run: | |
| 188 | + kubectl apply -f $GITHUB_WORKSPACE/engine/chaos/manifests/cases.yaml |
| 189 | + kubectl get -f $GITHUB_WORKSPACE/engine/chaos/manifests/cases.yaml |
| 190 | + kubectl describe -f $GITHUB_WORKSPACE/engine/chaos/manifests/cases.yaml |
| 191 | +
|
| 192 | + - name: Encode chaos-mesh action |
| 193 | + run: | |
| 194 | + echo CFG_BASE64=$(base64 -w 0 $GITHUB_WORKSPACE/engine/chaos/manifests/${{ matrix.chaos-obj }}.yaml) >> $GITHUB_ENV |
| 195 | +
|
| 196 | + - name: Run chaos mesh action |
| 197 | + uses: chaos-mesh/chaos-mesh-action@master |
| 198 | + env: |
| 199 | + CFG_BASE64: ${{ env.CFG_BASE64 }} |
| 200 | + CHAOS_MESH_VERSION: v1.0.0 |
| 201 | + |
| 202 | + # Check whether the chaos test case has completed, polling once per minute for up to 20 times. |
| 203 | + - name: Wait for chaos test case complete |
| 204 | + run: | |
| 205 | + $GITHUB_WORKSPACE/engine/chaos/scripts/check-case.sh |
| 206 | +
|
| 207 | + - name: Copy logs to hack permission  # copy logs into the workspace and chown them to `runner` |
| 208 | + if: ${{ always() }} |
| 209 | + run: | |
| 210 | + mkdir -p ./logs |
| 211 | + sudo cp -r -L /var/log/containers/. ./logs |
| 212 | + sudo find /var/ -type f | grep -E '.*/(server-master|executor)-[^/]*\.log$' | sudo xargs -I {} cp {} ./logs || true |
| 213 | + sudo chown -R runner ./logs |
| 214 | +
|
| 215 | + # Uploading logs as an artifact seems unstable, so we set `continue-on-error: true` here. |
| 216 | + - name: Upload logs |
| 217 | + continue-on-error: true |
| 218 | + uses: actions/upload-artifact@v2 |
| 219 | + if: ${{ always() }} |
| 220 | + with: |
| 221 | + name: chaos-base-logs.${{ matrix.chaos-obj }} |
| 222 | + path: | |
| 223 | + ./logs |
| 224 | + !./logs/coredns-* |
| 225 | + !./logs/local-path-provisioner-* |
| 226 | +
|
| 227 | + # Send a Feishu notification when a previous step of this job has failed. |
| 228 | + - name: Feishu notification |
| 229 | + continue-on-error: true |
| 230 | + uses: foxundermoon/feishu-action@v2 |
| 231 | + if: ${{ failure() }} |
| 232 | + with: |
| 233 | + url: ${{ secrets.ENGINE_FEISHU_NOTIFY_URL }} |
| 234 | + msg_type: text |
| 235 | + content: | |
| 236 | + text: | |
| 237 | + dataflow engine chaos job failed, see https://github.com/pingcap/tiflow/actions/runs/${{ github.run_id }} |
0 commit comments