agent-ecosystem
diff --git a/DEVELOPMENT.md b/DEVELOPMENT.md
Lines changed: 66 additions & 5 deletions
diff --git a/README.md b/README.md
Lines changed: 13 additions & 7 deletions
diff --git a/src/checks/authentication/auth-gate-detection.ts b/src/checks/authentication/auth-gate-detection.ts
Lines changed: 198 additions & 5 deletions
@@ -69,13 +69,18 @@ src/
     registry.ts    # Check registration and lookup
     index.ts       # Side-effect imports that register all checks
   cli/             # CLI entry point and formatters
-  helpers/         # Shared utilities (HTTP, markdown detection, etc.)
+  helpers/         # Shared utilities
+    get-page-urls.ts       # Page discovery (llms.txt, sitemap) and sampling
+    get-markdown-content.ts # Shared markdown fetching (cached or standalone)
+    detect-markdown.ts     # Heuristics for identifying markdown content
+    to-md-urls.ts          # Generate .md URL candidates from a page URL
+    html-to-markdown.ts    # HTML → markdown conversion
   runner.ts        # Orchestrates check execution with dependency resolution
   types.ts         # Shared type definitions
   http.ts          # Rate-limited HTTP client
 test/
-  unit/            # Unit tests (mocked HTTP via MSW)
-  integration/     # Integration tests (spawns the CLI binary)
+  unit/            # Unit tests (one check or helper at a time, mocked HTTP)
+  integration/     # Integration tests (CLI binary + cross-check pipelines)
   fixtures/        # Shared test fixtures
 bin/
   afdocs.mjs       # CLI binary entry point
@@ -126,6 +131,28 @@ const {
 
 This handles the full discovery chain (llms.txt links, sitemap, baseUrl fallback) and Fisher-Yates shuffles down to `maxLinksToTest` when needed.
 
+The result is **cached on `ctx._sampledPages`** so that all checks within a single run share the same sampled page list. This ensures consistent results: if `markdown-url-support` tests pages A, B, and C, then `content-negotiation`, `page-size-html`, `http-status-codes`, and every other check that calls `discoverAndSamplePages` will test the same pages A, B, and C. Do not bypass this caching by calling `getPageUrls` directly unless your check genuinely needs a different page set.
+
+### Getting markdown content
+
+Checks that analyze markdown content (page size, code fences, content parity, etc.) should use the shared `getMarkdownContent` helper from `src/helpers/get-markdown-content.ts`:
+
+```ts
+import { getMarkdownContent } from '../../helpers/get-markdown-content.js';
+
+const mdResult = await getMarkdownContent(ctx);
+// mdResult.mode === 'cached' when markdown-url-support or content-negotiation ran
+// mdResult.mode === 'standalone' when neither ran (fetches markdown independently)
+// mdResult.pages contains MarkdownPage[] with url, content, and source
+```
+
+This helper handles two scenarios:
+
+- **Cached mode**: When `markdown-url-support` or `content-negotiation` has run, reads from `ctx.pageCache`. Also checks whether the dependency passed (some pages had markdown) or failed (no markdown found).
+- **Standalone mode**: When neither dependency ran (e.g. user ran `--checks page-size-markdown` alone), discovers pages and fetches markdown independently.
+
+In both modes, llms.txt content from `llms-txt-exists` results is included automatically. The `source` field on each page indicates its origin (`'md-url'`, `'content-negotiation'`, `'standalone-md-url'`, `'standalone-content-negotiation'`, or `'llms-txt'`).
+
 ### Check dependencies and standalone mode
 
 Checks can declare dependencies via `dependsOn`. The runner resolves these so that, for example, `page-size-markdown` can read cached markdown from `markdown-url-support` and `content-negotiation`.
@@ -147,7 +174,21 @@ When a user runs a single check with `--checks`, its dependencies may not have e
 
 3. **Ensure parity.** A standalone check should produce the same results as when it runs as part of the full suite. If standalone mode discovers pages differently (fewer URLs, different sources), users will see inconsistent results depending on which checks they run.
 
-See `page-size-markdown.ts` for a concrete example: it reads from `ctx.pageCache` when dependencies ran, and falls back to `discoverAndSamplePages` with its own markdown fetching when they didn't.
+See `page-size-markdown.ts` for a concrete example: it uses `getMarkdownContent()`, which reads from `ctx.pageCache` when dependencies ran and falls back to independent fetching when they didn't.
+
+### Shared state between checks
+
+Checks communicate through three mechanisms on `CheckContext`:
+
+| Mechanism         | Written by                                    | Read by                          | Purpose                                        |
+| ----------------- | --------------------------------------------- | -------------------------------- | ---------------------------------------------- |
+| `previousResults` | Runner (after each check)                     | Any downstream check             | Check status, details (e.g. `discoveredFiles`) |
+| `pageCache`       | `markdown-url-support`, `content-negotiation` | `getMarkdownContent()` consumers | Cached markdown content keyed by page URL      |
+| `_sampledPages`   | `discoverAndSamplePages` (first call)         | All subsequent callers           | Ensures consistent page sampling across checks |
+
+When a check reads from `previousResults`, it creates an implicit ordering dependency. If your check reads from another check's results, either declare it in `dependsOn` or handle the case where it hasn't run. For example, `cache-header-hygiene` reads llms.txt URLs from `llms-txt-exists` results but doesn't declare it as a dependency; it gracefully falls back to an empty list.
+
+`content-negotiation` guards against overwriting `pageCache` entries that `markdown-url-support` already populated: `if (!ctx.pageCache.has(url))`. This ensures the `.md` URL version takes precedence when both mechanisms find markdown for the same page.
 
 ### Testing checks with dependencies
 
@@ -191,7 +232,13 @@ server.use(
 
 ## Testing
 
-Tests use [Vitest](https://vitest.dev/) with [MSW](https://mswjs.io/) (Mock Service Worker) for HTTP mocking. The typical pattern:
+Tests use [Vitest](https://vitest.dev/) with [MSW](https://mswjs.io/) (Mock Service Worker) for HTTP mocking. There are two levels of tests:
+
+### Unit tests (`test/unit/`)
+
+Each check gets its own test file at `test/unit/checks/<check-id>.test.ts`. These test a single check in isolation by manually constructing a `CheckContext` with the expected `previousResults` and `pageCache` state.
+
+The typical pattern:
 
 ```ts
 import { setupServer } from 'msw/node';
@@ -230,6 +277,20 @@ Use unique hostnames per test (e.g. `http://my-check-pass.local/...`) to avoid M
 
 Set `requestDelay: 0` in test contexts to avoid artificial delays.
 
+### Cross-check integration tests (`test/integration/check-pipeline.test.ts`)
+
+These tests run multiple real checks through the runner and verify that data flows correctly between them. Unlike unit tests (which manually set up context), pipeline tests exercise the actual check execution order, `pageCache` population, `previousResults` propagation, and shared sampling.
+
+**When to update `check-pipeline.test.ts`:**
+
+- **Adding a check that reads from `previousResults` or `pageCache`**: Add a test verifying it receives the expected data from its upstream checks, and that it handles the "upstream didn't run" case.
+- **Adding a check that writes to `pageCache`**: Add a test verifying downstream consumers see the cached data.
+- **Changing `dependsOn` declarations**: Add a test covering the new dependency chain (skip when dep fails, standalone when dep absent).
+- **Adding a check that calls `discoverAndSamplePages`**: Add it to the "shared sampling" test to verify it samples the same pages as other checks.
+- **Changing shared helpers** (`getMarkdownContent`, `discoverAndSamplePages`, etc.): Run the pipeline tests to verify cross-check behavior is preserved.
+
+The pipeline tests use a `setupSite` helper to configure a mock docs site with llms.txt, HTML pages, .md URLs, and content-negotiation support, then run a subset of checks via `runChecks()` with `checkIds`.
+
 ## Known issues
 
 ### Node.js 25 localStorage warning
 
@@ -7,7 +7,7 @@ Test your documentation site against the [Agent-Friendly Documentation Spec](htt
 
 Agents don't use docs like humans. They hit truncation limits, get walls of CSS instead of content, can't follow cross-host redirects, and don't know about quality-of-life improvements like `llms.txt` or `.md` docs pages that would make life swell. Maybe this is because the industry has lacked guidance - until now.
 
-afdocs runs 21 checks across 8 categories to evaluate how well your docs serve agent consumers. 10 are fully implemented; the rest return `skip` until completed.
+afdocs runs 21 checks across 8 categories to evaluate how well your docs serve agent consumers. 14 are fully implemented; the rest return `skip` until completed.
 
 > **Status: Early development (0.x)**
 > This project is under active development. Check IDs, CLI flags, and output formats may change between minor versions. Feel free to try it out, but don't build automation against specific output until 1.0.
@@ -36,8 +36,14 @@ Markdown Availability
   ✗ content-negotiation: Server ignores Accept: text/markdown header (0/50 sampled pages return markdown)
   ✗ markdown-url-support: No sampled pages support .md URLs (0/50 tested)
 
+URL Stability
+  ✓ http-status-codes: All 50 sampled pages return proper error codes for bad URLs
+
+Authentication
+  ✓ auth-gate-detection: All 50 sampled pages are publicly accessible
+
 Summary
-  5 passed, 2 failed, 14 skipped (21 total)
+  9 passed, 3 failed, 9 skipped (21 total)
 ```
 
 ## Install
@@ -138,7 +144,7 @@ describe('agent-friendliness', () => {
 
 ## Checks
 
-21 checks across 8 categories. Checks marked with \* are stub implementations that return `skip`.
+21 checks across 8 categories. Checks marked with \* are not yet implemented and return `skip`.
 
 ### Category 1: llms.txt
 
@@ -171,13 +177,13 @@ describe('agent-friendliness', () => {
 | --------------------------------- | -------------------------------------------------- |
 | `tabbed-content-serialization` \* | Whether tabbed content creates oversized output    |
 | `section-header-quality` \*       | Whether headers in tabbed sections include context |
-| `markdown-code-fence-validity` \* | Whether markdown has unclosed code fences          |
+| `markdown-code-fence-validity`    | Whether markdown has unclosed code fences          |
 
 ### Category 5: URL Stability and Redirects
 
 | Check                  | Description                                     |
 | ---------------------- | ----------------------------------------------- |
-| `http-status-codes` \* | Whether error pages return correct status codes |
+| `http-status-codes`    | Whether error pages return correct status codes |
 | `redirect-behavior` \* | Whether redirects are same-host HTTP redirects  |
 
 ### Category 6: Agent Discoverability Directives
@@ -192,13 +198,13 @@ describe('agent-friendliness', () => {
 | ---------------------------- | ---------------------------------------------- |
 | `llms-txt-freshness` \*      | Whether `llms.txt` reflects current site state |
 | `markdown-content-parity` \* | Whether markdown and HTML versions match       |
-| `cache-header-hygiene` \*    | Whether cache headers allow timely updates     |
+| `cache-header-hygiene`       | Whether cache headers allow timely updates     |
 
 ### Category 8: Authentication and Access
 
 | Check                        | Description                                                          |
 | ---------------------------- | -------------------------------------------------------------------- |
-| `auth-gate-detection` \*     | Whether documentation pages require authentication to access content |
+| `auth-gate-detection`        | Whether documentation pages require authentication to access content |
 | `auth-alternative-access` \* | Whether auth-gated sites provide alternative access paths for agents |
 
 ## Check dependencies
 
@@ -1,12 +1,205 @@
 import { registerCheck } from '../registry.js';
+import { discoverAndSamplePages } from '../../helpers/get-page-urls.js';
 import type { CheckContext, CheckResult } from '../../types.js';
 
-async function check(_ctx: CheckContext): Promise<CheckResult> {
+/**
+ * Outcome assigned to each probed page:
+ * - `accessible`: no auth gate detected (also used for redirects that do not match an
+ *   SSO host, for status codes other than 200/401/403/3xx, and as the placeholder
+ *   classification on fetch errors)
+ * - `auth-required`: the server answered 401 or 403
+ * - `soft-auth-gate`: a 200 response whose body looks like a login page
+ * - `auth-redirect`: a 3xx response whose Location matches an SSO host pattern
+ */
+type PageClassification = 'accessible' | 'auth-required' | 'soft-auth-gate' | 'auth-redirect';
+
+/** Per-page outcome recorded by the auth-gate check. */
+interface AuthResult {
+  /** Page URL that was probed. */
+  url: string;
+  /** How the response was classified. */
+  classification: PageClassification;
+  /** HTTP status of the response, or `null` when the request threw. */
+  status: number | null;
+  /** Redirect target; set only on `auth-redirect` results. */
+  redirectUrl?: string;
+  /** The `SSO_DOMAINS` entry the redirect target matched; set only on `auth-redirect` results. */
+  ssoDomain?: string;
+  /** Reason the body was judged to be a login page; set only on `soft-auth-gate` results. */
+  hint?: string;
+  /** Error message when the request failed; results carrying it are excluded from the tested set. */
+  error?: string;
+}
+
+/**
+ * Host patterns that `isSsoDomain` compares a redirect target's hostname against.
+ * The first five entries are full identity-provider domains; the entries ending in `.`
+ * are generic leading labels (e.g. the `sso.` in `sso.example.com`).
+ * Order matters: `isSsoDomain` returns the first entry that matches.
+ */
+const SSO_DOMAINS = [
+  'okta.com',
+  'auth0.com',
+  'login.microsoftonline.com',
+  'accounts.google.com',
+  'login.salesforce.com',
+  'sso.',
+  'idp.',
+  'auth.',
+  'login.',
+];
+
+/**
+ * Returns the `SSO_DOMAINS` entry that the URL's hostname matches, or `undefined`
+ * when nothing matches or the URL cannot be parsed.
+ *
+ * Entries ending in `.` (e.g. `login.`) are leading-label patterns and match when the
+ * hostname starts with them. All other entries are full domains and match only the
+ * domain itself or one of its subdomains, so look-alike hosts such as
+ * `okta.community.example` are not mistaken for `okta.com`.
+ *
+ * @param url - Absolute URL to inspect.
+ * @returns The first matching pattern, or `undefined`.
+ */
+function isSsoDomain(url: string): string | undefined {
+  let hostname: string;
+  try {
+    hostname = new URL(url).hostname.toLowerCase();
+  } catch {
+    // Not an absolute URL: nothing to match against.
+    return undefined;
+  }
+
+  return SSO_DOMAINS.find((domain) =>
+    domain.endsWith('.')
+      ? hostname.startsWith(domain)
+      : hostname === domain || hostname.endsWith('.' + domain),
+  );
+}
+
+/**
+ * Heuristically decides whether an HTML body is a login page.
+ *
+ * Only the first 50,000 characters are inspected. Signals, in priority order:
+ * a password `<input>`, a `<title>` that mentions signing or logging in, and a
+ * `<form>` whose action URL contains an SSO-related keyword.
+ *
+ * @param body - HTML text to inspect.
+ * @returns A human-readable reason when a login signal is found, otherwise `undefined`.
+ */
+function detectLoginForm(body: string): string | undefined {
+  // Keep the original casing so the title quoted in the hint reads as the page wrote it;
+  // every pattern below is case-insensitive instead.
+  const sample = body.slice(0, 50000);
+
+  // Require the type attribute inside the <input> tag itself, and accept double-quoted,
+  // single-quoted, or unquoted values.
+  if (/<input\b[^>]*\stype\s*=\s*["']?password\b/i.test(sample)) {
+    return 'Contains password input field';
+  }
+
+  // Check page title for login indicators. [\s\S] lets the title span several lines.
+  const rawTitle = /<title[^>]*>([\s\S]*?)<\/title>/i.exec(sample)?.[1] ?? '';
+  const title = rawTitle.replace(/\s+/g, ' ').trim();
+  // Word boundaries keep titles such as "Signing requests" or "Blog Insights" from matching.
+  if (/\b(?:sign\s*in|log\s*in)\b|authenticate/i.test(title)) {
+    return `Page title suggests login: "${title}"`;
+  }
+
+  // Check for SSO form actions
+  if (/<form[^>]*action\s*=\s*["'][^"']*(?:saml|oauth|openid|sso|auth)[^"']*["']/i.test(sample)) {
+    return 'Contains SSO-related form action';
+  }
+
+  return undefined;
+}
+
+/**
+ * Probes every discovered (possibly sampled) page without following redirects and
+ * classifies it as accessible, auth-required (401/403), auth-redirect (3xx whose
+ * Location matches an SSO host pattern), or soft-auth-gate (200 whose body looks
+ * like a login page).
+ *
+ * Resulting status: `pass` when no tested page is gated, `warn` when only some are,
+ * and `fail` when every tested page is gated or no page could be fetched at all.
+ * Pages whose request threw are counted as fetch errors and left out of the tested set.
+ */
+async function check(ctx: CheckContext): Promise<CheckResult> {
+  const id = 'auth-gate-detection';
+  const category = 'authentication';
+
+  const { urls: pageUrls, totalPages, sampled, warnings } = await discoverAndSamplePages(ctx);
+
+  const results: AuthResult[] = [];
+  // Clamp the batch size to at least 1: zero or a negative value would keep the loop
+  // below from ever finishing, and NaN would skip every page.
+  const concurrency = Math.max(1, Math.floor(ctx.options.maxConcurrency) || 1);
+
+  for (let i = 0; i < pageUrls.length; i += concurrency) {
+    const batch = pageUrls.slice(i, i + concurrency);
+    const batchResults = await Promise.all(
+      batch.map(async (url): Promise<AuthResult> => {
+        try {
+          // Redirects are inspected, not followed, so the Location header stays visible.
+          const response = await ctx.http.fetch(url, { redirect: 'manual' });
+          const status = response.status;
+
+          // Auth-required status codes
+          if (status === 401 || status === 403) {
+            return { url, classification: 'auth-required', status };
+          }
+
+          // Redirect — check if it's to an SSO domain
+          if (status >= 300 && status < 400) {
+            const location = response.headers.get('location');
+            if (location) {
+              // Absolute http(s) targets are used verbatim; anything else is resolved
+              // against the page URL. A value that cannot be resolved is kept as-is and
+              // simply fails the SSO match instead of being reported as a fetch error.
+              let resolvedLocation = location;
+              if (!/^https?:\/\//i.test(location)) {
+                try {
+                  resolvedLocation = new URL(location, url).toString();
+                } catch {
+                  // keep the raw header value
+                }
+              }
+              const ssoDomain = isSsoDomain(resolvedLocation);
+              if (ssoDomain) {
+                return {
+                  url,
+                  classification: 'auth-redirect',
+                  status,
+                  redirectUrl: resolvedLocation,
+                  ssoDomain,
+                };
+              }
+            }
+            // Non-SSO redirect — treat as accessible (normal redirect)
+            return { url, classification: 'accessible', status };
+          }
+
+          // 200 — check for soft auth gate (login form)
+          if (status === 200) {
+            let body: string;
+            try {
+              body = await response.text();
+            } catch {
+              // Body could not be read; the 200 status alone counts as accessible.
+              return { url, classification: 'accessible', status };
+            }
+
+            const loginHint = detectLoginForm(body);
+            if (loginHint) {
+              return { url, classification: 'soft-auth-gate', status, hint: loginHint };
+            }
+
+            return { url, classification: 'accessible', status };
+          }
+
+          // Other status codes — treat as accessible
+          return { url, classification: 'accessible', status };
+        } catch (err) {
+          // The classification here is a placeholder: results carrying `error` are
+          // filtered out of the tested set below.
+          return {
+            url,
+            classification: 'accessible',
+            status: null,
+            error: err instanceof Error ? err.message : String(err),
+          };
+        }
+      }),
+    );
+    results.push(...batchResults);
+  }
+
+  const fetchErrors = results.filter((r) => r.error).length;
+  const tested = results.filter((r) => !r.error);
+
+  if (tested.length === 0) {
+    return {
+      id,
+      category,
+      status: 'fail',
+      message: `Could not fetch any pages to check authentication${fetchErrors > 0 ? `; ${fetchErrors} failed to fetch` : ''}`,
+      details: {
+        totalPages,
+        testedPages: results.length,
+        sampled,
+        fetchErrors,
+        pageResults: results,
+        discoveryWarnings: warnings,
+      },
+    };
+  }
+
+  const accessible = tested.filter((r) => r.classification === 'accessible');
+  const authRequired = tested.filter((r) => r.classification === 'auth-required');
+  const softAuthGate = tested.filter((r) => r.classification === 'soft-auth-gate');
+  const authRedirect = tested.filter((r) => r.classification === 'auth-redirect');
+  const gatedCount = authRequired.length + softAuthGate.length + authRedirect.length;
+
+  // Distinct SSO patterns seen across redirects, collected without a type assertion.
+  const ssoDomains = [...new Set(authRedirect.flatMap((r) => (r.ssoDomain ? [r.ssoDomain] : [])))];
+
+  let status: 'pass' | 'warn' | 'fail';
+  let message: string;
+  const suffix = fetchErrors > 0 ? `; ${fetchErrors} failed to fetch` : '';
+  const pageLabel = sampled ? 'sampled pages' : 'pages';
+
+  if (gatedCount === 0) {
+    status = 'pass';
+    message = `All ${accessible.length} ${pageLabel} are publicly accessible${suffix}`;
+  } else if (accessible.length > 0) {
+    status = 'warn';
+    message = `${gatedCount} of ${tested.length} ${pageLabel} require authentication (${accessible.length} accessible)${suffix}`;
+  } else {
+    status = 'fail';
+    message = `All ${tested.length} ${pageLabel} require authentication${suffix}`;
+  }
+
   return {
-    id: 'auth-gate-detection',
-    category: 'authentication',
-    status: 'skip',
-    message: 'Not yet implemented',
+    id,
+    category,
+    status,
+    message,
+    details: {
+      totalPages,
+      testedPages: results.length,
+      sampled,
+      accessible: accessible.length,
+      authRequired: authRequired.length,
+      softAuthGate: softAuthGate.length,
+      authRedirect: authRedirect.length,
+      ssoDomains,
+      fetchErrors,
+      pageResults: results,
+      discoveryWarnings: warnings,
+    },
   };
 }