# Source: AiSOC/services/agents/tests/test_adversary_eval.py (prince30121/AiSOC, main branch)
"""
Pillar-1 Evaluation: AI-vs-AI Adversary — Graceful-Degradation Gate (w2-aivai)
==============================================================================
Sixth CI-gated eval suite. Loads the deterministic adversary-mutated dataset
produced by `scripts/generate_adversary_incidents.py` and grades the same
keyword tactic extractor that powers `test_mitre_accuracy.py`.

The dataset rewrites every defender-known keyword into evasive synonyms,
character obfuscation, and fragmentation across three intensity buckets:

    heavy   ≈ 45%   every keyword swapped
    medium  ≈ 35%   one expected tactic preserved cleanly
    light   ≈ 20%   light leetspeak only (control bucket)

Two regression floors are enforced:

  * **Overall catch rate ≥ 0.40** — under heavy adversarial mutation the
    defender is expected to drop ~50pp from its 0.95 baseline. The floor
    of 0.40 keeps "graceful degradation" honest: the substrate must still
    catch a meaningful fraction, but not anywhere near the unmutated
    baseline.
  * **Light-bucket catch rate ≥ 0.85** — light-tier obfuscation is just
    leetspeak; if the defender silently fails this bucket too, then a
    heavy-tier failure is hiding a deeper regression.

Only the overall floor sits below the 0.80 floor of the unmutated
suite; the light-bucket floor (0.85) is a control check and sits above
it. The point of this suite is NOT to prove the defender is great
under adversarial pressure — it's to prove the defender doesn't fall
off a cliff and to give us a measurable graceful-degradation curve we
can watch over time.

Run:
    pytest services/agents/tests/test_adversary_eval.py -v
    # or via the public eval harness:
    python scripts/run_evals.py --out eval_report.json
"""

from __future__ import annotations

import json
import unittest
from pathlib import Path
from typing import Any

from .test_mitre_accuracy import extract_tactics_from_text  # type: ignore

# Directory containing this test module; dataset paths are resolved relative to it.
_TESTS_DIR = Path(__file__).parent
# Adversary-mutated dataset that the evaluation in this module reads and scores.
_ADVERSARY_PATH = _TESTS_DIR / "eval_data" / "adversary_incidents.json"
# Base dataset; only its length is compared against the mutated dataset
# (see `test_dataset_present`).
_BASE_PATH = _TESTS_DIR / "eval_data" / "synthetic_incidents.json"


# Regression floors. See module docstring for rationale.
# Minimum overall accuracy asserted by `test_overall_graceful_degradation`.
_OVERALL_FLOOR = 0.40
# Minimum light-bucket accuracy asserted by `test_light_bucket_still_caught`.
_LIGHT_BUCKET_FLOOR = 0.85
# Heavy-tier upper bound: if the heavy bucket starts catching too much,
# either the mutation grammar has drifted off the keyword catalogue or the
# defender has silently widened its substring matches. Either way it
# means the "adversarial" dataset isn't actually adversarial anymore.
# Asserted (as a maximum) by `test_heavy_bucket_actually_evades`.
_HEAVY_BUCKET_CEILING = 0.50


class AdversaryEvalResult:
    """Mutable tally for one run of the adversary evaluation.

    Attributes:
        total: number of incidents scored.
        correct: incidents where the prediction overlapped the expected set.
        bucket_counts: incidents seen per intensity bucket.
        bucket_correct: correct incidents per intensity bucket.
        lost_all_tactics: incidents with no overlap at all.
        per_tactic_lost: per expected tactic, how often it was not predicted.
        details: one record per incident, as appended by the scorer.
    """

    # Canonical intensity buckets, in the order they are reported.
    _KNOWN_BUCKETS = ("heavy", "medium", "light")

    def __init__(self) -> None:
        self.total = 0
        self.correct = 0
        self.bucket_counts: dict[str, int] = {b: 0 for b in self._KNOWN_BUCKETS}
        self.bucket_correct: dict[str, int] = {b: 0 for b in self._KNOWN_BUCKETS}
        self.lost_all_tactics = 0
        self.per_tactic_lost: dict[str, int] = {}
        self.details: list[dict[str, Any]] = []

    @property
    def accuracy(self) -> float:
        """Overall fraction of correct incidents; 0.0 when nothing was scored."""
        return self.correct / self.total if self.total else 0.0

    def bucket_accuracy(self, bucket: str) -> float:
        """Fraction of correct incidents in *bucket*; 0.0 for an empty or unknown bucket."""
        n = self.bucket_counts.get(bucket, 0)
        return (self.bucket_correct.get(bucket, 0) / n) if n else 0.0

    def _reported_buckets(self) -> list[str]:
        """Return the canonical buckets followed by any extra labels that were recorded.

        Extra labels keep their insertion order so the output is deterministic
        without requiring the labels to be mutually sortable.
        """
        names = list(self._KNOWN_BUCKETS)
        for source in (self.bucket_counts, self.bucket_correct):
            for label in source:
                if label not in names:
                    names.append(label)
        return names

    def to_summary(self) -> dict[str, Any]:
        """Build a JSON-serialisable summary of the tally.

        The three canonical buckets are always present and come first. Any
        other bucket label that was recorded (for example a misspelled
        intensity in the dataset) is reported after them, so incidents that
        count toward the totals never disappear from the breakdown.
        """
        return {
            "incidents": self.total,
            "correct": self.correct,
            "accuracy": round(self.accuracy, 4),
            "lost_all_tactics": self.lost_all_tactics,
            "buckets": {
                b: {
                    # .get() because an extra label may exist in only one of the two maps.
                    "incidents": self.bucket_counts.get(b, 0),
                    "correct": self.bucket_correct.get(b, 0),
                    "accuracy": round(self.bucket_accuracy(b), 4),
                }
                for b in self._reported_buckets()
            },
            "per_tactic_lost": dict(sorted(self.per_tactic_lost.items())),
        }


def _load_adversary_dataset() -> list[dict[str, Any]]:
    """Read and parse the adversary-mutated incident dataset.

    Returns:
        The parsed JSON content of ``_ADVERSARY_PATH``. Callers treat it as a
        list of incident dicts; the shape is not validated here.

    Raises:
        FileNotFoundError: if the dataset file does not exist. The message
            names the generator script to run.
    """
    try:
        # Decode explicitly as UTF-8 (the JSON interchange encoding) instead
        # of relying on the platform locale, which varies between machines.
        raw = _ADVERSARY_PATH.read_text(encoding="utf-8")
    except FileNotFoundError as exc:
        raise FileNotFoundError(
            f"Adversary dataset missing at {_ADVERSARY_PATH}. Generate with: python3 scripts/generate_adversary_incidents.py"
        ) from exc
    return json.loads(raw)


def evaluate_adversary_accuracy() -> AdversaryEvalResult:
    """Score the keyword tactic extractor on every adversary-mutated incident.

    An incident counts as caught when at least one predicted tactic also
    appears in its expected set. The aim is to measure graceful
    degradation, not exact-match detection.

    Returns:
        An ``AdversaryEvalResult`` holding overall and per-bucket tallies,
        per-tactic miss counts, and one detail record per incident.
    """
    dataset = _load_adversary_dataset()
    tally = AdversaryEvalResult()

    for record in dataset:
        # Incidents without an explicit intensity are filed under "heavy".
        tier = record.get("adversary_intensity", "heavy")
        wanted = set(record.get("expected_tactics", []))
        found = extract_tactics_from_text(f"{record['title']}\n{record['description']}")
        shared = found & wanted
        caught = bool(shared)

        tally.total += 1
        tally.bucket_counts[tier] = tally.bucket_counts.get(tier, 0) + 1

        if not caught:
            tally.lost_all_tactics += 1
        else:
            tally.correct += 1
            tally.bucket_correct[tier] = tally.bucket_correct.get(tier, 0) + 1

        # Track every expected tactic the extractor missed, even on caught incidents.
        for tactic in wanted - found:
            tally.per_tactic_lost[tactic] = tally.per_tactic_lost.get(tactic, 0) + 1

        entry = {
            "incident_id": record.get("id"),
            "template_id": record.get("template_id"),
            "adversary_intensity": tier,
            "expected": sorted(wanted),
            "predicted": sorted(found),
            "overlap": sorted(shared),
            "correct": caught,
        }
        tally.details.append(entry)

    return tally


# ---------------------------------------------------------------------------
# pytest tests
# ---------------------------------------------------------------------------


class TestAdversaryEval(unittest.TestCase):
    """Sixth CI suite — graceful-degradation under adversarial mutation."""

    def test_dataset_present(self) -> None:
        """Both datasets exist and the mutated set has the same size as the base set."""
        self.assertTrue(
            _ADVERSARY_PATH.exists(),
            f"Adversary dataset missing at {_ADVERSARY_PATH}. Run scripts/generate_adversary_incidents.py to (re)generate it.",
        )
        # Fail with a readable assertion instead of a raw FileNotFoundError
        # when the base dataset is the one that is missing.
        self.assertTrue(
            _BASE_PATH.exists(),
            f"Base dataset missing at {_BASE_PATH}; cannot check that the adversary dataset mirrors it.",
        )
        # The mutated set must mirror the base set 1:1 so per-template
        # diffs are meaningful.
        # Decode explicitly as UTF-8 so the result does not depend on the
        # platform's locale encoding.
        base = json.loads(_BASE_PATH.read_text(encoding="utf-8"))
        mutated = json.loads(_ADVERSARY_PATH.read_text(encoding="utf-8"))
        self.assertEqual(
            len(base),
            len(mutated),
            f"Adversary dataset size {len(mutated)} != base dataset size {len(base)}",
        )

    def test_dataset_is_actually_mutated(self) -> None:
        """Make sure the generator actually changed the text — not a no-op.

        Some templates legitimately contain no defender keyword the grammar
        knows about (and the light bucket only applies leetspeak), so a
        meaningful fraction of the corpus will pass through unchanged. The
        floor here just guards against the grammar collapsing to a no-op.
        """
        mutated = _load_adversary_dataset()
        # An incident is "unchanged" only when both title and description
        # equal their recorded originals.
        unchanged = sum(
            1 for inc in mutated if inc["title"] == inc.get("original_title") and inc["description"] == inc.get("original_description")
        )
        # Strictly fewer than 35% (integer floor) of incidents may be unchanged.
        self.assertLess(
            unchanged,
            (len(mutated) * 35) // 100,
            f"{unchanged}/{len(mutated)} incidents unchanged — mutation grammar may have regressed.",
        )

    def test_overall_graceful_degradation(self) -> None:
        """Overall catch rate must stay at or above ``_OVERALL_FLOOR``."""
        result = evaluate_adversary_accuracy()
        print(
            f"\n[eval] Adversary catch rate: {result.correct}/{result.total} = "
            f"{result.accuracy * 100:.1f}% "
            f"(heavy={result.bucket_accuracy('heavy') * 100:.1f}%, "
            f"medium={result.bucket_accuracy('medium') * 100:.1f}%, "
            f"light={result.bucket_accuracy('light') * 100:.1f}%)"
        )
        # The summary is truncated to 4000 characters to keep the failure
        # message readable.
        self.assertGreaterEqual(
            result.accuracy,
            _OVERALL_FLOOR,
            f"Adversary catch rate {result.accuracy:.1%} below "
            f"graceful-degradation floor of {_OVERALL_FLOOR:.0%}.\n" + json.dumps(result.to_summary(), indent=2)[:4000],
        )

    def test_light_bucket_still_caught(self) -> None:
        """Light-tier obfuscation is leetspeak only — defender should pass."""
        result = evaluate_adversary_accuracy()
        light_acc = result.bucket_accuracy("light")
        self.assertGreaterEqual(
            light_acc,
            _LIGHT_BUCKET_FLOOR,
            f"Light-bucket adversary accuracy {light_acc:.1%} below "
            f"control floor of {_LIGHT_BUCKET_FLOOR:.0%}. "
            "Defender keyword extractor may have regressed.",
        )

    def test_heavy_bucket_actually_evades(self) -> None:
        """Heavy-tier mutation must actually hurt the defender.

        If heavy catches too much, the dataset isn't adversarial anymore
        — either the grammar has regressed or the defender has silently
        widened its substring matches.
        """
        result = evaluate_adversary_accuracy()
        heavy_acc = result.bucket_accuracy("heavy")
        self.assertLessEqual(
            heavy_acc,
            _HEAVY_BUCKET_CEILING,
            f"Heavy-bucket adversary accuracy {heavy_acc:.1%} above "
            f"adversariality ceiling of {_HEAVY_BUCKET_CEILING:.0%}. "
            "Mutation grammar isn't actually evading detection — "
            "synonyms may be leaking defender keywords.",
        )

    def test_bucket_distribution(self) -> None:
        """Heavy bucket must be substantial — otherwise we're not testing it."""
        result = evaluate_adversary_accuracy()
        # Heavy must hold strictly more than a quarter (integer floor) of all incidents.
        self.assertGreater(
            result.bucket_counts["heavy"],
            result.total // 4,
            f"Heavy bucket only {result.bucket_counts['heavy']}/{result.total} — mutation distribution may have drifted.",
        )
        self.assertGreater(
            result.bucket_counts["light"],
            0,
            "Light bucket is empty — no control sample.",
        )


# Run the suite with unittest's own runner when executed as a program.
# NOTE: this module uses a relative import (`from .test_mitre_accuracy ...`),
# so it has to be launched as part of its package (e.g. via `python -m`)
# rather than as a bare script path.
if __name__ == "__main__":
    unittest.main()