# channel_config.yaml — johndpope/VASA-1-hack (branch: nemo)

# Top-level model settings.
model:
  # The padding entry in channel_layout describes itself as padding "to match model_dim".
  model_dim: 512
  # NOTE(review): unit not stated in this file (presumably frames per window) — confirm.
  max_seq_len: 60
  # Symmetric lower/upper limits; which values get clipped is decided by the
  # consuming code, which is not visible in this file.
  clip_bounds:
    min: -100.0
    max: 100.0

# Shared feature widths.
# `audio_features` is referenced through interpolation by
# channel_layout.audio_features.size and by audio_models.*.output_dim.
# `prev_expression` and `prev_audio` are not referenced anywhere else in this file.
dimensions:
  prev_expression: 64
  prev_audio: 64
  audio_features: 512  # Aligned with JoyVASA's feature_dim

# Landmark groups: number of points per facial region and coordinates per point.
# These values feed the `points*coords` size expressions in channel_layout and
# the arguments of projections.landmark_norm_dim.
# With the values below, points*coords is: lips 60, right_eye 24, left_eye 21,
# jaw 30, nose 12 (147 in total).
landmarks:
  lips:
    points: 20
    coords: 3
  right_eye:
    points: 8
    coords: 3
  # NOTE(review): left_eye has 7 points while right_eye has 8 — confirm the
  # asymmetry is intentional.
  left_eye:
    points: 7
    coords: 3
  jaw:
    points: 10
    coords: 3
  nose:
    points: 4
    coords: 3

# Named channel groups, each with a `size` and a human-readable `description`.
# NOTE(review): entry order presumably defines the channel order seen by the
# consumer — confirm before reordering anything here.
channel_layout:
  # Previous context will be handled directly through transformer attention,
  # not compressed into the condition embedding

  # Current window conditions
  audio_features:
    size: ${dimensions.audio_features}
    description: "Current audio features"

  gaze:
    size: 2  # pitch and yaw
    description: "Gaze direction angles"

  head_distance:
    size: 1
    description: "Head distance from camera"

  emotion:
    size: 2  # valence and arousal
    description: "Emotion parameters"

  # NOTE(review): the comment on projections.control_norm_dim says speed_bucket
  # was removed from the control vector, yet it is still listed here — confirm
  # whether this entry is still consumed.
  speed_bucket:
    size: 1
    description: "Motion speed category"

  # Landmark features
  # Each size below is written as `<points>*<coords>` built from two interpolations.
  # NOTE(review): with OmegaConf-style string interpolation these would resolve to
  # text such as "20*3" rather than an integer unless the consuming code
  # evaluates the expression — confirm.
  lips:
    size: ${landmarks.lips.points}*${landmarks.lips.coords}
    description: "Lip landmark coordinates"

  right_eye:
    size: ${landmarks.right_eye.points}*${landmarks.right_eye.coords}
    description: "Right eye landmark coordinates"

  left_eye:
    size: ${landmarks.left_eye.points}*${landmarks.left_eye.coords}
    description: "Left eye landmark coordinates"

  jaw:
    size: ${landmarks.jaw.points}*${landmarks.jaw.coords}
    description: "Jaw landmark coordinates"

  nose:
    size: ${landmarks.nose.points}*${landmarks.nose.coords}
    description: "Nose landmark coordinates"

  # Same width (32) as projections.blink.output_dim.
  blink_state:
    size: 32
    description: "Blink state embedding"

  # NOTE(review): the "363" in the description does not match the sizes listed
  # in this section. With the landmark counts in this file, the entries other
  # than audio_features and padding total 185 (6 control + 147 landmark +
  # 32 blink), and audio_features alone is 512 — confirm the intended
  # arithmetic and whether 149 is still correct.
  padding:
    size: 149
    description: "Padding to match model_dim (512 - 363 = 149)"

# Audio model dimensions
# One entry per supported audio encoder. Both entries share hidden_dim 256 and
# take their output_dim from dimensions.audio_features; only input_dim differs.
# projections.audio selects one of these entries by interpolation.
audio_models:
  wav2vec:
    input_dim: 768
    hidden_dim: 256
    output_dim: ${dimensions.audio_features}
  whisper:
    input_dim: 384
    hidden_dim: 256
    output_dim: ${dimensions.audio_features}

# Projection and normalisation dimensions.
# `audio` references whichever entry of `audio_models` you want to use.
projections:
  # Currently wav2vec; point this at audio_models.whisper to use Whisper features.
  audio: ${audio_models.wav2vec}
  # use_talkvid_audio_projection: true  # Set to true to use TalkVid-style Perceiver architecture instead of JoyVASA linear layer

  # output_dim equals channel_layout.blink_state.size (32).
  blink:
    input_dim: 3
    hidden_dim: 32
    output_dim: 32

  motion_norm_dim: 256
  control_norm_dim: 5  # Updated from 6 to 5 (removed speed_bucket: gaze:2 + distance:1 + emotion:2)
  # One `<points>*<coords>` argument per landmark group, passed to a `sum` resolver.
  # The folded scalar (`>-`) joins the lines below with single spaces, giving the
  # same single-line string as the previous implicit multi-line plain scalar.
  # NOTE(review): `sum` is not defined in this file; presumably a custom resolver
  # registered by the loading code that also evaluates each product — confirm.
  # With the landmark counts in this file the intended total would be 147.
  landmark_norm_dim: >-
    ${sum:${landmarks.lips.points}*${landmarks.lips.coords},
    ${landmarks.right_eye.points}*${landmarks.right_eye.coords},
    ${landmarks.left_eye.points}*${landmarks.left_eye.coords},
    ${landmarks.jaw.points}*${landmarks.jaw.coords},
    ${landmarks.nose.points}*${landmarks.nose.coords}}