Skip to content

Replace redirected urls with their targets #4557

Merged
carlosalberto merged 9 commits into open-telemetry:main from
trask:replace-redirects
Jun 16, 2025
Merged

Replace redirected urls with their targets #4557
carlosalberto merged 9 commits into open-telemetry:main from
trask:replace-redirects

Conversation

@trask
Copy link
Copy Markdown
Member

@trask trask commented Jun 13, 2025

Related to @chalin's #4554

We could enforce no redirects via lychee's max_redirects = 0 configuration, but we'd need to make a few exclusions for that to work, and it would only make our link check failures even more common as 3rd party sites move things around. Probably a better option to address @chalin's specific ask would be to have a separate lychee run with max_redirects = 0 that only checks https://opentelemetry.io links.

Most of this was done using:

python script
import os
import re
import requests
import concurrent.futures

def update_links_in_file(filepath):
    """Rewrite redirected ``https://`` links in a Markdown file in place.

    Three link forms are processed, in this order: inline links
    (``[text](https://...)``), reference-style definitions
    (``[label]: https://...``) and HTML ``href="https://..."`` attributes.
    Every URL found is passed to ``get_redirect_url``; when that returns a
    non-empty URL that differs from the original, the link is rewritten to
    point at it. Otherwise the matched text is left exactly as it was.

    The file is written back, and ``Updated: <filepath>`` is printed, only
    when at least one link actually changed.

    Args:
        filepath: Path of the UTF-8 encoded file to update.
    """
    # newline='' disables newline translation so that a file using CRLF line
    # endings keeps them; only the URLs themselves should change.
    with open(filepath, 'r', encoding='utf-8', newline='') as f:
        content = f.read()

    def redirected(url):
        """Return the new target for *url*, or None when it must be kept."""
        new_url = get_redirect_url(url)
        if new_url and new_url != url:
            return new_url
        return None

    def replacer(match):
        new_url = redirected(match.group(2))
        if new_url is None:
            return match.group(0)
        # group 3 is whatever followed the URL inside the parentheses
        # (e.g. a ` "title"`); it is re-emitted untouched.
        return f'[{match.group(1)}]({new_url}{match.group(3)})'

    def replacer_ref(match):
        new_url = redirected(match.group(2))
        if new_url is None:
            return match.group(0)
        return f'[{match.group(1)}]: {new_url}'

    def replacer_html(match):
        new_url = redirected(match.group(1))
        if new_url is None:
            return match.group(0)
        return f'href="{new_url}"'

    # Markdown link: [text](https://...)
    # NOTE: a URL that itself contains ')' is still cut at the first ')'.
    pattern = re.compile(
        r'\['                 # opening square bracket for the text
        r'([^]]+)'            # group 1: text
        r']'                  # closing square bracket for the text
        r'\('                 # opening parenthesis for the URL
        r'(https://[^)\s]+)'  # group 2: URL, ends at whitespace or ')'
        r'([^)]*)'            # group 3: optional trailing title, kept as is
        r'\)'                 # closing parenthesis for the URL
    )
    new_content = pattern.sub(replacer, content)

    # Markdown link: reference-style [label]: https://...
    # The URL ends at the first whitespace so that a trailing title or
    # trailing spaces/CR on the line are not treated as part of the URL.
    pattern = re.compile(
        r'\['               # opening square bracket for the ref
        r'([^]]+)'          # group 1: ref
        r']: '              # closing square bracket for the ref
        r'(https://\S+)'    # group 2: URL
    )
    new_content = pattern.sub(replacer_ref, new_content)

    # Markdown link: html
    pattern = re.compile(
        r'href="'
        r'(https://[^"]+)'
        r'"'
    )
    new_content = pattern.sub(replacer_html, new_content)

    if new_content != content:
        with open(filepath, 'w', encoding='utf-8', newline='') as f:
            f.write(new_content)
        print(f'Updated: {filepath}')

def get_redirect_url(url):
    """Return the final URL that *url* redirects to, or None to keep *url*.

    A HEAD request is issued with redirects followed. A replacement is only
    proposed when the final response is a 200 and at least one hop in the
    redirect chain was a 301 or 302. A few hosts get special treatment:

    * Slack archive short links are never resolved.
    * ``https://en.wikipedia.org`` targets are rewritten to
      ``https://wikipedia.org``.
    * Targets that land on a GitHub or Google sign-in page are ignored.
    * ``http://arxiv.org`` targets are upgraded to ``https``.
    * ``?utm_source=godoc`` is stripped from ``https://pkg.go.dev/`` targets.

    Any exception raised while resolving is swallowed and None is returned.
    """
    # keep these short links as they are
    if url.startswith('https://cloud-native.slack.com/archives/'):
        return None

    try:
        response = requests.head(url, allow_redirects=True, timeout=5)
        if not response.history or response.status_code != 200:
            return None

        followed_redirect = any(
            hop.status_code in (301, 302) for hop in response.history
        )
        if not followed_redirect:
            return None

        target = response.url
        if target.startswith('https://en.wikipedia.org'):
            return target.replace('https://en.wikipedia.org', 'https://wikipedia.org')

        login_prefixes = (
            'https://github.com/login?return_to',
            'https://accounts.google.com/v3/signin/',
        )
        if target.startswith(login_prefixes):
            # this link requires authentication, so we can't do anything with it
            return None

        if target.startswith('http://arxiv.org'):
            return target.replace('http://arxiv.org', 'https://arxiv.org')

        if target.startswith('https://pkg.go.dev/'):
            # no need for this query parameter
            return re.sub(r'\?utm_source=godoc(?=#|$)', '', target)

        return target
    except Exception:
        return None


# Collect every Markdown file below the current directory, except
# CHANGELOG.md files and anything inside a node_modules directory.
filepaths = []

for dirpath, dirnames, filenames in os.walk('.'):
    # Prune in place so os.walk never descends into node_modules at all,
    # instead of walking the whole tree and discarding each directory.
    dirnames[:] = [name for name in dirnames if name != 'node_modules']
    for filename in filenames:
        if filename == 'CHANGELOG.md':
            continue
        if filename.endswith('.md'):
            filepaths.append(os.path.join(dirpath, filename))

# The work is dominated by network round-trips, so files are processed
# concurrently. Each file is submitted individually and its outcome checked:
# an exception raised inside a worker would otherwise be silently lost, and
# one bad file should not prevent the remaining files from being processed.
with concurrent.futures.ThreadPoolExecutor() as executor:
    futures = {
        executor.submit(update_links_in_file, path): path
        for path in filepaths
    }
    for future in concurrent.futures.as_completed(futures):
        error = future.exception()
        if error is not None:
            print(f'Failed: {futures[future]}: {error!r}')

Comment thread README.md Outdated
Comment thread issue-management.md Outdated
Comment thread oteps/0199-support-elastic-common-schema-in-opentelemetry.md Outdated
Comment thread specification/context/env-carriers.md Outdated
@trask trask marked this pull request as ready for review June 13, 2025 03:35
@trask trask requested review from a team June 13, 2025 03:35
Copy link
Copy Markdown
Contributor

@chalin chalin left a comment

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Great!

@chalin
Copy link
Copy Markdown
Contributor

chalin commented Jun 13, 2025

@trask - might it be useful to commit that helper script in one of the OTel repos?

@chalin
Copy link
Copy Markdown
Contributor

chalin commented Jun 13, 2025

If so, @trask, please let me know where it lands.

@carlosalberto carlosalberto enabled auto-merge June 16, 2025 15:26
@carlosalberto carlosalberto added this pull request to the merge queue Jun 16, 2025
Merged via the queue into open-telemetry:main with commit 7730934 Jun 16, 2025
6 checks passed
@arminru arminru added the editorial Editorial changes only (typos, changelog, ...). No content-related changes of any kind. label Jun 17, 2025
github-merge-queue Bot pushed a commit that referenced this pull request Jun 17, 2025
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment

Labels

editorial Editorial changes only (typos, changelog, ...). No content-related changes of any kind.

Projects

None yet

Development

Successfully merging this pull request may close these issues.

5 participants