Skip to content

Commit 06dac5d

Browse files
authored
chaos(engine): enable chaos test in dataflow and add two tests. (pingcap#5641)
ref pingcap#5640
1 parent 2ccee24 commit 06dac5d

File tree

14 files changed

+455
-76
lines changed

14 files changed

+455
-76
lines changed
Lines changed: 237 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,237 @@
1+
name: Dataflow Engine Chaos

on:
  schedule:
    - cron: '0 17-23 * * *' # run at minute 0 every hour from 01:00 ~ 07:00 UTC+8

# See: https://docs.github.com/en/actions/reference/workflow-syntax-for-github-actions#concurrency.
concurrency:
  group: ${{ github.ref }}-${{ github.workflow }}
  cancel-in-progress: true

# A workflow run is made up of one or more jobs that can run sequentially or in parallel
jobs:
  # This workflow contains a single job called "base"
  base:
    # The type of runner that the job will run on
    runs-on: ubuntu-18.04
    timeout-minutes: 30
    strategy:
      fail-fast: false
      matrix:
        chaos-obj:
          [
            "pod-failure-dataflow",
            "pod-kill-dataflow",
          ]

    # Steps represent a sequence of tasks that will be executed as part of the job
    steps:
      - uses: actions/checkout@v2

      - uses: actions/setup-go@v3
        with:
          go-version: 1.18

      - name: Cache go modules
        uses: actions/cache@v2
        with:
          path: ~/go/pkg/mod
          key: ${{ runner.os }}-dataflow-${{ hashFiles('go.sum') }}

      # Set up Kubernetes with K3s
      - name: Set up K3s cluster
        run: |
          curl -sfL https://get.k3s.io | INSTALL_K3S_VERSION=v1.18.9+k3s1 sh -s - \
            --write-kubeconfig-mode=644 \
            "${k3s_disable_command:---disable}" metrics-server \
            "${k3s_disable_command:---disable}" traefik \
            --flannel-backend=none \
            --docker
        shell: bash

      # this may be failed sometimes, and I want to exit the workflow directly if failed,
      # but GitHub Actions doesn't support early-exit yet, see https://github.com/actions/runner/issues/662.
      # so, simply wait for a long time.
      - name: Wait for coredns
        run: |
          kubectl rollout status --watch --timeout 600s deployment/coredns -n kube-system
        shell: bash
        env:
          KUBECONFIG: /etc/rancher/k3s/k3s.yaml

      - name: Export KUBECONFIG environment variable
        run: |
          echo 'KUBECONFIG=/etc/rancher/k3s/k3s.yaml' >> $GITHUB_ENV
        shell: bash

      - name: Print cluster information
        run: |
          kubectl config view
          kubectl cluster-info
          kubectl get nodes
          kubectl get pods -n kube-system
          kubectl get sc
          kubectl version

      - name: Build dataflow engine binary
        run: make df-master df-executor df-chaos-case

      - name: Build Dataflow engine docker image
        run: |
          cp -r $GITHUB_WORKSPACE/engine/chaos/manifests/conf/ $GITHUB_WORKSPACE/bin/
          docker build -f $GITHUB_WORKSPACE/engine/chaos/manifests/Dockerfile -t dataflow:chaos $GITHUB_WORKSPACE/bin
          docker image list

      # Set up metastore and basic services
      - name: Set up metastore and basic services
        run: |
          kubectl apply -f $GITHUB_WORKSPACE/engine/chaos/manifests/metastore.yaml
          kubectl get -f $GITHUB_WORKSPACE/engine/chaos/manifests/metastore.yaml
          kubectl describe -f $GITHUB_WORKSPACE/engine/chaos/manifests/metastore.yaml

      - name: Wait for metastore ready
        run: |
          kubectl wait --for=condition=Ready pod/metastore-framework-mysql-0 --timeout=60s || true
          kubectl wait --for=condition=Ready pod/metastore-user-etcd-0 --timeout=60s || true

          echo show pvc
          kubectl get pvc -l app=metastore -o wide
          echo show pv
          kubectl get pv -o wide
          echo show svc
          kubectl get svc -l app=metastore -o wide
          echo show sts
          kubectl get sts -l app=metastore -o wide
          echo show po
          kubectl get po -l app=metastore -o wide
          echo describe po
          kubectl describe po -l app=metastore
          echo describe pvc
          kubectl describe pvc -l app=metastore
          kubectl wait --for=condition=Ready pod/metastore-framework-mysql-0 --timeout=0s
          kubectl wait --for=condition=Ready pod/metastore-user-etcd-0 --timeout=0s

      - name: Set up server-master
        run: |
          kubectl apply -f $GITHUB_WORKSPACE/engine/chaos/manifests/server-master.yaml
          kubectl get -f $GITHUB_WORKSPACE/engine/chaos/manifests/server-master.yaml
          kubectl describe -f $GITHUB_WORKSPACE/engine/chaos/manifests/server-master.yaml

      - name: Wait for server-master ready
        run: |
          kubectl wait --for=condition=Ready pod -l app=server-master --all --timeout=60s || true
          echo "<<<<< show pvc >>>>>"
          kubectl get pvc -l app=server-master -o wide
          echo "<<<<< show pv >>>>>"
          kubectl get pv -o wide
          echo "<<<<< show svc >>>>>"
          kubectl get svc -l app=server-master -o wide
          echo "<<<<< show sts >>>>>"
          kubectl get sts -l app=server-master -o wide
          echo "<<<<< show po >>>>>"
          kubectl get po -l app=server-master -o wide
          echo "<<<<< describe po >>>>>"
          kubectl describe po -l app=server-master
          echo "<<<<< describe pvc >>>>>"
          kubectl describe pvc -l app=server-master
          echo "<<<<< show current log for server-master-0 >>>>>"
          kubectl logs server-master-0 || true
          echo "<<<<< show previous log for server-master-0 >>>>>"
          kubectl logs server-master-0 -p || true
          echo "<<<<< show current log for server-master-1 >>>>>"
          kubectl logs server-master-1 || true
          echo "<<<<< show previous log for server-master-1 >>>>>"
          kubectl logs server-master-1 -p || true
          echo "<<<<< show current log for server-master-2 >>>>>"
          kubectl logs server-master-2 || true
          echo "<<<<< show previous log for server-master-2 >>>>>"
          kubectl logs server-master-2 -p || true

      - name: Set up executor
        run: |
          kubectl apply -f $GITHUB_WORKSPACE/engine/chaos/manifests/executor.yaml
          kubectl get -f $GITHUB_WORKSPACE/engine/chaos/manifests/executor.yaml
          kubectl describe -f $GITHUB_WORKSPACE/engine/chaos/manifests/executor.yaml

      - name: Wait for executor ready
        run: |
          kubectl wait --for=condition=Ready pod -l app=executor --all --timeout=60s || true
          echo "<<<<< show pvc >>>>>"
          kubectl get pvc -l app=executor -o wide
          echo "<<<<< show pv >>>>>"
          kubectl get pv -o wide
          echo "<<<<< show svc >>>>>"
          kubectl get svc -l app=executor -o wide
          echo "<<<<< show sts >>>>>"
          kubectl get sts -l app=executor -o wide
          echo "<<<<< show po >>>>>"
          kubectl get po -l app=executor -o wide
          echo "<<<<< describe po >>>>>"
          kubectl describe po -l app=executor
          echo "<<<<< describe pvc >>>>>"
          kubectl describe pvc -l app=executor
          echo "<<<<< show current log for executor-0 >>>>>"
          kubectl logs executor-0 || true
          echo "<<<<< show previous log for executor-0 >>>>>"
          kubectl logs executor-0 -p || true
          echo "<<<<< show current log for executor-1 >>>>>"
          kubectl logs executor-1 || true
          echo "<<<<< show previous log for executor-1 >>>>>"
          kubectl logs executor-1 -p || true
          echo "<<<<< show current log for executor-2 >>>>>"
          kubectl logs executor-2 || true
          echo "<<<<< show previous log for executor-2 >>>>>"
          kubectl logs executor-2 -p || true

      - name: Set up chaos test cases
        run: |
          kubectl apply -f $GITHUB_WORKSPACE/engine/chaos/manifests/cases.yaml
          kubectl get -f $GITHUB_WORKSPACE/engine/chaos/manifests/cases.yaml
          kubectl describe -f $GITHUB_WORKSPACE/engine/chaos/manifests/cases.yaml

      - name: Encode chaos-mesh action
        run: |
          echo CFG_BASE64=$(base64 -w 0 $GITHUB_WORKSPACE/engine/chaos/manifests/${{ matrix.chaos-obj }}.yaml) >> $GITHUB_ENV

      - name: Run chaos mesh action
        uses: chaos-mesh/chaos-mesh-action@master
        env:
          CFG_BASE64: ${{ env.CFG_BASE64 }}
          CHAOS_MESH_VERSION: v1.0.0

      # check whether complete with 1m * 20 times.
      - name: Wait for chaos test case complete
        run: |
          $GITHUB_WORKSPACE/engine/chaos/scripts/check-case.sh

      - name: Copy logs to hack permission
        if: ${{ always() }}
        run: |
          mkdir ./logs
          sudo cp -r -L /var/log/containers/. ./logs
          sudo find /var/ -type f | grep -E '.*/(server-master|executor)-[^/]*.log$' | sudo xargs -i cp {} ./logs || true
          sudo chown -R runner ./logs

      # Upload logs as artifact seems not stable, so we set `continue-on-error: true` here.
      - name: Upload logs
        continue-on-error: true
        uses: actions/upload-artifact@v2
        if: ${{ always() }}
        with:
          name: chaos-base-logs.${{ matrix.chaos-obj }}
          path: |
            ./logs
            !./logs/coredns-*
            !./logs/local-path-provisioner-*

      # Send feishu notification if failed.
      - name: Feishu notification
        continue-on-error: true
        uses: foxundermoon/feishu-action@v2
        if: ${{ failure() }}
        with:
          url: ${{ secrets.ENGINE_FEISHU_NOTIFY_URL }}
          msg_type: text
          content: |
            text: |
              dataflow engine chaos job runs, see https://github.com/pingcap/tiflow/actions/runs/${{ github.run_id }}
Lines changed: 119 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,119 @@
1+
// Copyright 2022 PingCAP, Inc.
2+
//
3+
// Licensed under the Apache License, Version 2.0 (the "License");
4+
// you may not use this file except in compliance with the License.
5+
// You may obtain a copy of the License at
6+
//
7+
// http://www.apache.org/licenses/LICENSE-2.0
8+
//
9+
// Unless required by applicable law or agreed to in writing, software
10+
// distributed under the License is distributed on an "AS IS" BASIS,
11+
// See the License for the specific language governing permissions and
12+
// limitations under the License.
13+
14+
package main
15+
16+
import (
17+
"context"
18+
"encoding/json"
19+
"errors"
20+
"fmt"
21+
"time"
22+
23+
"go.uber.org/zap"
24+
25+
"github.com/pingcap/tiflow/dm/pkg/log"
26+
pb "github.com/pingcap/tiflow/engine/enginepb"
27+
"github.com/pingcap/tiflow/engine/lib/fake"
28+
"github.com/pingcap/tiflow/engine/test/e2e"
29+
"github.com/pingcap/tiflow/pkg/util"
30+
)
31+
32+
func runFakeJobCase(ctx context.Context, cfg *config) error {
33+
serverMasterEndpoints := []string{cfg.MasterAddr}
34+
etcdEndpoints := []string{cfg.EtcdAddr}
35+
36+
jobCfg := &fake.Config{
37+
JobName: "fake-job-case",
38+
WorkerCount: 8,
39+
// use a large enough target tick to ensure the fake job long running
40+
TargetTick: 10000000,
41+
EtcdWatchEnable: true,
42+
EtcdEndpoints: etcdEndpoints,
43+
EtcdWatchPrefix: "/fake-job/test/",
44+
}
45+
e2eCfg := &e2e.FakeJobConfig{
46+
EtcdEndpoints: etcdEndpoints, // reuse user meta KV endpoints
47+
WorkerCount: jobCfg.WorkerCount,
48+
KeyPrefix: jobCfg.EtcdWatchPrefix,
49+
}
50+
cli, err := e2e.NewUTCli(ctx, serverMasterEndpoints, etcdEndpoints, e2eCfg)
51+
if err != nil {
52+
return err
53+
}
54+
revision, err := cli.GetRevision(ctx)
55+
if err != nil {
56+
return err
57+
}
58+
jobCfg.EtcdStartRevision = revision
59+
cfgBytes, err := json.Marshal(jobCfg)
60+
if err != nil {
61+
return err
62+
}
63+
64+
// create a fake job
65+
jobID, err := cli.CreateJob(ctx, pb.JobType_FakeJob, cfgBytes)
66+
if err != nil {
67+
return err
68+
}
69+
70+
// update upstream etcd, and check fake job works normally every 60 seconds
71+
// run 10 times, about 10 minutes totally.
72+
mvcc := 0
73+
interval := 60 * time.Second
74+
runTime := 10
75+
for i := 0; i < runTime; i++ {
76+
value := fmt.Sprintf("update-value-index-%d", i)
77+
mvcc++
78+
start := time.Now()
79+
err := updateKeyAndCheck(ctx, cli, jobID, jobCfg.WorkerCount, value, mvcc)
80+
if err != nil {
81+
return err
82+
}
83+
duration := time.Since(start)
84+
log.L().Info("update key and check test", zap.Int("round", i), zap.Duration("duration", duration))
85+
if duration < interval {
86+
time.Sleep(start.Add(interval).Sub(time.Now()))
87+
}
88+
}
89+
90+
log.L().Info("run fake job case successfully")
91+
92+
return nil
93+
}
94+
95+
func updateKeyAndCheck(
96+
ctx context.Context, cli *e2e.ChaosCli, jobID string, workerCount int,
97+
updateValue string, expectedMvcc int,
98+
) error {
99+
for i := 0; i < workerCount; i++ {
100+
err := cli.UpdateFakeJobKey(ctx, i, updateValue)
101+
if err != nil {
102+
return err
103+
}
104+
}
105+
finished := util.WaitSomething(60, time.Second*5, func() bool {
106+
for jobIdx := 0; jobIdx < workerCount; jobIdx++ {
107+
err := cli.CheckFakeJobKey(ctx, jobID, jobIdx, expectedMvcc, updateValue)
108+
if err != nil {
109+
log.L().Warn("check fail job failed", zap.Error(err))
110+
return false
111+
}
112+
}
113+
return true
114+
})
115+
if !finished {
116+
return errors.New("wait fake job normally timeout")
117+
}
118+
return nil
119+
}

0 commit comments

Comments
 (0)