Skip to content

Commit de7163f

Browse files
committed
feat(Tactic/Linter): lint unwanted unicode (#36773)
Extends the text-based style linter that checks all unicode characters. Provides automatic replacements for some disallowed characters. Unicode is very versatile and useful for Lean and Mathlib. However, it is also very complex and few people have a thorough understanding of all its pitfalls (I don't claim to be one of them). In order to avoid unpleasant surprises going forward, both accidental and malicious, we should keep track of which Unicode characters are allowed in Mathlib. In programming and cybersecurity, there are many known [issues and attacks concerning unicode](https://en.wikipedia.org/wiki/Unicode#Security_issues). Many open source repositories have been hit by such attacks, which are becoming ever more frequent due to the use of automation using e.g. large language models. Some notable ones: - [homograph attacks](https://en.wikipedia.org/wiki/IDN_homograph_attack): confusion caused by use of distinct characters which look the same (we probably don't want to fully address this and this PR does not attempt to) - [Trojan Source](https://en.wikipedia.org/wiki/Trojan_Source): abuse of bidirectional characters. Characters used by languages with right-to-left reading direction can cause code to be displayed differently than it is parsed. (This PR tries to address this) - Exploits involving Private Use Area characters, e.g. [GlassWorm](https://abit.ee/en/cybersecurity/viruses-trojans-and-other-malware/glassworm-github-supply-chain-attack-unicode-solana-malware-npm-vs-code-2026-en). (This PR tries to address this) See also [unicode code source handling](https://www.unicode.org/reports/tr55/) and [Programming with Unicode](https://unicodebook.readthedocs.io/nightmare.html) for further details and guidelines. Co-authored-by: Michael Rothgang @grunweg Co-authored-by: Jon Eugster @joneugster
1 parent fa6418a commit de7163f

File tree

8 files changed

+272
-84
lines changed

8 files changed

+272
-84
lines changed

Mathlib/Data/String/Defs.lean

Lines changed: 8 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -14,6 +14,14 @@ This file defines a bunch of functions for the `String` datatype.
1414

1515
@[expose] public section
1616

17+
/-- Determine if a unicode character is part of the ASCII subset.
18+
19+
ASCII characters have code points in the range U+0000-U+007F.
20+
(Note: [the Unicode standard](https://www.unicode.org/Public/draft/charts/CodeCharts.pdf)) does
21+
not mention ASCII and instead calls this subset "C0 Controls and Basic Latin".)
22+
-/
23+
def Char.isAscii (c : Char) : Bool := c.toNat < 0x80
24+
1725
namespace String
1826

1927
/-- Pad `s : String` with repeated occurrences of `c : Char` until it's of length `n`.

Mathlib/RingTheory/Spectrum/Prime/Polynomial.lean

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -25,7 +25,8 @@ Also see `AlgebraicGeometry/AffineSpace` for the affine space over arbitrary sch
2525
- `Polynomial.exists_image_comap_of_monic`:
2626
If `g : R[X]` is monic, the image of `Z(g) ∩ D(f) : Spec R[X]` in `Spec R` is compact open.
2727
- `Polynomial.isOpenMap_comap_C`: The structure map `Spec R[X] → Spec R` is an open map.
28-
- `MvPolynomial.isOpenMap_comap_C`: The structure map `Spec R[X̲] → Spec R` is an open map.
28+
- `MvPolynomial.isOpenMap_comap_C`:
29+
The structure map `Spec (MvPolynomial σ R) → Spec R` is an open map.
2930
3031
-/
3132

Mathlib/Tactic/Linter/TextBased.lean

Lines changed: 4 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -94,8 +94,9 @@ def StyleError.errorMessage (err : StyleError) : String := match err with
9494
endings (\\n) instead"
9595
| trailingWhitespace => "This line ends with some whitespace: please remove this"
9696
| semicolon => "This line contains a space before a semicolon"
97-
| StyleError.unwantedUnicode c => s!"This line contains a bad unicode character \
98-
'{c}' ({c.printCodepointHex})."
97+
| StyleError.unwantedUnicode c => s!"This line contains a unicode character that is not on the \
98+
allowlist '{c}' ({c.printCodepointHex}). \
99+
For adding new symbols see `Mathlib.Linter.TextBased.UnicodeLinter.othersInMathlib`."
99100
| StyleError.unicodeVariant s selector =>
100101
let variantText := if selector == UnicodeVariant.emoji then
101102
"emoji"
@@ -226,7 +227,7 @@ def parse?_errorContext (line : String) : Option ErrorContext := Id.run do
226227
-- extract the offending unicode character from `errorMessage`
227228
-- (if the offending character is 'C', `errorMessage[7] == "'C'"` )
228229
-- and wrap it in the appropriate `StyleError`, which will print it as '+NNNN'
229-
let str ← errorMessage[7]?
230+
let str ← errorMessage[12]?
230231
let c ← String.Pos.Raw.get? str ⟨1-- take middle character of expected three
231232
StyleError.unwantedUnicode c
232233
| "ERR_UNICODE_VARIANT" => do

Mathlib/Tactic/Linter/TextBased/UnicodeLinter.lean

Lines changed: 153 additions & 67 deletions
Original file line numberDiff line numberDiff line change
@@ -39,59 +39,163 @@ public def String.printCodepointHex (s : String) : String :=
3939
-- note: must not contain spaces because of the error message parsing below!
4040
";".intercalate <| s.toList.map Char.printCodepointHex
4141

42+
/-- Allowed (by the linter) whitespace characters -/
43+
def ASCII.allowedWhitespace (c : Char) : Bool := #[' ', '\n'].contains c
44+
45+
/-- Printable ASCII characters, not including whitespace. -/
46+
def ASCII.printable (c : Char) : Bool := 0x21 ≤ c.toNat && c.toNat ≤ 0x7e
47+
48+
/-- Allowed (by the linter) ASCII characters -/
49+
def ASCII.allowed (c : Char) : Bool := ASCII.allowedWhitespace c || ASCII.printable c
50+
51+
/--
52+
Symbols with VSCode extension abbreviation, as of March 17, 2026.
53+
Obtained using `scripts/extract-unique-nonascii.lean` from
54+
https://github.com/leanprover/vscode-lean4/blob/132d329aa8afa3e8508ef77cfdcff112d3b35c88
55+
-/
56+
def withVSCodeAbbrev : Array Char := #[
57+
'⦃', '⦄', '⟦', '⟧', '⟨', '⟩', '⟮', '⟯', '⸨', '⸩', '‹', '›', '«', '»', '⁅', '⁆',
58+
'‖', '₊', '⌊', '⌋', '⌈', '⌉', '⦋', '⦌', 'α', 'β', 'χ', '↓', 'ε', 'γ', '∩', 'μ',
59+
'∘', 'Π', '▸', '→', '↑', '∨', '×', '⁻', '¹', '∼', '·', '⋆', '¬', '¿', '₁', '₂',
60+
'₃', '₄', '₅', '₆', '₇', '₈', '₉', '₀', '←', 'Ø', '⅋', '𝔸', 'ℂ', 'Δ', '𝔽', 'Γ',
61+
'ℍ', '⋂', '𝕂', 'Λ', 'ℕ', 'ℚ', 'ℝ', 'Σ', '⋃', 'ℤ', '♯', '∶', '∣', 'δ', 'ζ', 'η',
62+
'θ', 'ι', 'κ', 'λ', 'ν', 'ξ', 'π', 'ρ', 'ς', 'σ', 'τ', 'φ', 'ψ', 'ω', 'À', 'Á',
63+
'Â', 'Ã', 'Ä', 'Ā', 'Ç', 'È', 'É', 'Ê', 'Ë', 'Ē', 'Ì', 'Í', 'Î', 'Ï', 'Ī', 'Ñ',
64+
'Ò', 'Ó', 'Ô', 'Õ', 'Ö', 'Ō', 'Ù', 'Ú', 'Û', 'Ü', 'Ū', 'Ý', 'à', 'á', 'â', 'ã',
65+
'ä', 'ā', 'ç', 'è', 'é', 'ê', 'ë', 'ē', 'ì', 'í', 'î', 'ï', 'ī', 'ñ', 'ò', 'ó',
66+
'ô', 'õ', 'ö', 'ø', 'ō', 'ù', 'ú', 'û', 'ü', 'ū', 'ý', 'ÿ', 'Ł', '♩', '∉', '≮',
67+
'𐆎', '∌', '∋', '⟹', '♮', '₦', '∇', '≉', '№', '⇍', '⇎', '⇏', '⊯', '⊮', '≇', '↗',
68+
'≢', '≠', '∄', '≱', '≯', '↚', '↮', '≰', '∤', '∦', '⋠', '⊀', '↛', '≄', '≁', '⊈',
69+
'⊄', '⋡', '⊁', '⊉', '⊅', '⋬', '⋪', '⋭', '⋫', '⊭', '⊬', '↖', '≃', '≖', '≕', '⋝',
70+
'⋜', '⊢', '–', '∃', '∅', '—', '€', 'ℓ', '≅', '∈', '⊺', '∫', '⨍', '∆', '⊓', '⨅',
71+
'∞', '↔', 'ı', '≣', '≡', '≗', '⇒', '≋', '≊', '≈', '⟶', 'ϩ', '↩', '↪', '₴', 'ͱ',
72+
'♥', 'ℏ', '∻', '≔', '∺', '∷', '≂', '⊣', '²', '³', '∹', '─', '═', '━', '╌', '⊸',
73+
'≑', '≐', '∔', '∸', '⋯', '≘', '⟅', '≙', '∧', '∠', '∟', 'Å', '∀', 'ᶠ', 'ᵐ', 'ℵ',
74+
'⁎', '∗', '≍', '⌶', 'å', 'æ', '₳', '∐', '≚', 'ª', 'º', '⊕', 'ᵒ', 'ᵈ', 'ᵃ', 'ᵖ',
75+
'⊖', '⊝', '⊗', '⊘', '⊙', '⊚', '⊜', 'œ', '🛑', 'Ω', '℥', 'ο', '∮', '∯', '⨂', '∂',
76+
'≛', '≜', '▹', '▿', '⊴', '◃', '⊵', '▵', '⬝', '◂', '↞', '↠', '⁀', '∴', '℡', '₸',
77+
'♪', 'µ', '⁄', '฿', '✝', '⁒', '₡', '℗', '₩', '₱', '‱', '₤', '℞', '※', '‽', '℮',
78+
'◦', '₮', '⊤', 'ₛ', 'ₐ', 'ᵇ', 'ₗ', 'ₘ', 'ₚ', '⇨', '¬', '⋖', '⩿', '≝', '°', 'ϯ',
79+
'⊡', '₫', '⇊', '⇃', '⇂', '↘', '⇘', '₯', '↙', '⇙', '⇵', '↧', '⟱', '⇓', '↡', '‡',
80+
'⋱', '↯', '◆', '◇', '◈', '⚀', '÷', '⋇', '⌀', '♢', '⋄', 'ϝ', '†', 'ℸ', 'ð', '≞',
81+
'∡', '↦', '♂', '✠', '₼', 'ℐ', '−', '₥', '℧', '⊧', '∓', '≟', '⁇', '🛇', '∏', '∝',
82+
'≾', '≼', '⋨', '≺', '′', '↣', '𝒫', '£', '▰', '▱', '㉐', '¶', '∥', '±', '⟂', 'ᗮ',
83+
'‰', '⅌', '₧', '⋔', '≦', '≤', '↝', '↢', '↽', '↼', '⇇', '⇆', '⇋', '↭', '⋋', '≲',
84+
'⋚', '≶', '⊔', '⟷', '⇔', '⌟', '⟵', '↤', '⇚', '⇐', '↜', '⌞', '〚', '≪', '₾', '…',
85+
'“', '《', '⧏', '◁', '⋦', '≨', '↫', '↬', '✧', '‘', '⋉', '≧', '≥', '„', '‚', '₲',
86+
'ϫ', '⋙', '≫', 'ℷ', '⋧', '≩', '≳', '⋗', '⋛', '≷', '≴', '⟪', '≵', '⟫', '√', '✂',
87+
'⊂', '⊃', '⊏', '⊐', '⊆', '⊊', '⊇', '⊋', '⨆', '∛', '∜', '≿', '≽', '⋩', '≻', '∑',
88+
'⤳', '⋢', '⊑', '⋣', '⊒', '□', '⇝', '■', '▣', '▢', '◾', '✦', '✶', '✴', '✹', 'ϛ',
89+
'₷', '∙', '♠', '∢', '§', 'ϻ', 'ϡ', 'ϸ', 'ϭ', 'ϣ', '﹨', '∖', '⌣', '•', '◀', 'Τ',
90+
'Θ', 'Þ', '∪', '‿', '⯑', '⊎', '⊍', '↨', '↕', '⇕', '⇖', '⌜', '⇗', '⌝', '⇈', '⇅',
91+
'↥', '⟰', '⇑', '↟', 'υ', '↿', '↾', '⋀', 'Å', 'Æ', 'Α', '⋁', '⨁', '⨀', '⍟', 'Œ',
92+
'Ω', 'Ο', 'Ι', 'ℑ', '⨄', '⨃', 'Υ', 'ƛ', 'Ϫ', 'Β', 'Ε', 'Ζ', 'Κ', 'Μ', 'Ν', 'Ξ',
93+
'Ρ', 'Φ', 'Χ', 'Ψ', '✉', '⋘', '↰', '⊨', '⇰', '⊩', '⊫', '⊪', '⋒', '⋓', '¤', '⋞',
94+
'⋟', '⋎', '⋏', '↶', '↷', '♣', '🚧', 'ᶜ', '∁', '©', '●', '○', '◌', '◎', '◯', '↺',
95+
'↻', '®', 'Ⓢ', '⊛', '¢', '₵', '℃', 'ȩ', '✓', '₢', '☡', '∎', '⧸', '⊹', '⊞', '⊟',
96+
'⊠', '¦', 'ℙ', '𝔹', '⅀', '𝟘', '𝟙', '𝟚', '𝟛', '𝟜', '𝟝', '𝟞', '𝟟', '𝟠', '𝟡', '𝟬',
97+
'𝟭', '𝟮', '𝟯', '𝟰', '𝟱', '𝟲', '𝟳', '𝟴', '𝟵', '‣', '≏', '☣', '★', '▽', '△', 'ℶ',
98+
'≬', '∵', '≌', '∍', '‵', '⋍', '∽', '⊼', '▪', '☻', '▾', '▴', '⊥', '⋈', '◫', '⇉',
99+
'⇶', '⇄', '⇛', '▬', '▭', '⟆', '☢', '〛', '’', '⇁', '⇀', '⇌', '⋌', '≓', '₽', '₨',
100+
'▷', '⋊', '”', '》', '⥤', '½', '⅓', '¼', '⅕', '⅙', '⅛', '⅟', '⅔', '⅖', '¾', '⅗',
101+
'⅜', '⅘', '⅚', '⅝', '⅞', '⌢', '♀', 'ϥ', '℻', '≒', '♭', 'ℜ', '↱', 'Ϥ', '☹', 'Ϩ',
102+
'Ͱ', 'Ϧ', 'Ϟ', 'ᵉ', 'ᵍ', 'ʰ', 'ⁱ', 'ʲ', 'ᵏ', 'ˡ', 'ⁿ', 'ʳ', 'ˢ', 'ᵗ', 'ᵘ', 'ᵛ',
103+
'ʷ', 'ˣ', 'ʸ', 'ᶻ', 'ᴬ', 'ᴮ', 'ᴰ', 'ᴱ', 'ᴳ', 'ᴴ', 'ᴵ', 'ᴶ', 'ᴷ', 'ᴸ', 'ᴹ', 'ᴺ',
104+
'ᴼ', 'ᴾ', 'ᴿ', 'ᵀ', 'ᵁ', 'ⱽ', 'ᵂ', '⁰', '⁴', '⁵', '⁶', '⁷', '⁸', '⁹', '⁾', '⁽',
105+
'⁼', '⁺', 'ꭟ', 'ᶶ', 'ᶷ', 'ꭞ', 'ꭝ', 'ᶪ', 'ᶩ', 'ꟹ', 'ꭜ', 'ʱ', 'ꟸ', 'ᶿ', 'ᶺ', 'ᶭ',
106+
'ᵚ', 'ᶣ', 'ᶛ', 'ᵆ', 'ᵄ', 'ᵎ', 'ᵌ', 'ʵ', 'ʴ', 'ᶵ', 'ᵔ', 'ᶾ', 'ᶴ', 'ᶞ', 'ᵑ', 'ᶽ',
107+
'ᶼ', 'ᶹ', 'ᶦ', 'ᶫ', 'ᶰ', 'ᶸ', 'ᶧ', 'ʶ', 'ᶝ', 'ᵡ', 'ᶳ', 'ᶢ', 'ᵊ', 'ᵙ', 'ᶲ', 'ᶱ',
108+
'ᵝ', 'ᵕ', 'ᶯ', 'ᶮ', 'ᶬ', 'ᶨ', 'ᶥ', 'ᶤ', 'ᶟ', 'ˤ', 'ᵠ', 'ᵞ', 'ˠ', 'ᵜ', 'ᵅ', 'ᵓ',
109+
'ᵋ', 'ᴽ', 'ᴻ', 'ᴲ', 'ᴯ', 'ᴭ', '℠', '™', 'ₑ', 'ₕ', 'ᵢ', 'ⱼ', 'ₖ', 'ₙ', 'ₒ', 'ᵣ',
110+
'ₜ', 'ᵤ', 'ᵥ', 'ₓ', '₎', '₍', '₌', '₋', '‼', '⁉', 'Ϻ', 'Ϡ', 'Ϸ', 'Ϭ', 'Ϣ', 'Ϛ',
111+
'⋐', '⋑', '☺', '𝐀', '𝐁', '𝐂', '𝐃', '𝐄', '𝐅', '𝐆', '𝐇', '𝐈', '𝐉', '𝐊', '𝐋', '𝐌',
112+
'𝐍', '𝐎', '𝐏', '𝐐', '𝐑', '𝐒', '𝐓', '𝐔', '𝐕', '𝐖', '𝐗', '𝐘', '𝐙', '𝐚', '𝐛', '𝐜',
113+
'𝐝', '𝐞', '𝐟', '𝐠', '𝐡', '𝐢', '𝐣', '𝐤', '𝐥', '𝐦', '𝐧', '𝐨', '𝐩', '𝐪', '𝐫', '𝐬',
114+
'𝐭', '𝐮', '𝐯', '𝐰', '𝐱', '𝐲', '𝐳', '𝐴', '𝐵', '𝐶', '𝐷', '𝐸', '𝐹', '𝐺', '𝐻', '𝐼',
115+
'𝐽', '𝐾', '𝐿', '𝑀', '𝑁', '𝑂', '𝑃', '𝑄', '𝑅', '𝑆', '𝑇', '𝑈', '𝑉', '𝑊', '𝑋', '𝑌',
116+
'𝑍', '𝑎', '𝑏', '𝑐', '𝑑', '𝑒', '𝑓', '𝑔', '𝑖', '𝑗', '𝑘', '𝑙', '𝑚', '𝑛', '𝑜', '𝑝',
117+
'𝑞', '𝑟', '𝑠', '𝑡', '𝑢', '𝑣', '𝑤', '𝑥', '𝑦', '𝑧', '𝑨', '𝑩', '𝑪', '𝑫', '𝑬', '𝑭',
118+
'𝑮', '𝑯', '𝑰', '𝑱', '𝑲', '𝑳', '𝑴', '𝑵', '𝑶', '𝑷', '𝑸', '𝑹', '𝑺', '𝑻', '𝑼', '𝑽',
119+
'𝑾', '𝑿', '𝒀', '𝒁', '𝒂', '𝒃', '𝒄', '𝒅', '𝒆', '𝒇', '𝒈', '𝒉', '𝒊', '𝒋', '𝒌', '𝒍',
120+
'𝒎', '𝒏', '𝒐', '𝒑', '𝒒', '𝒓', '𝒔', '𝒕', '𝒖', '𝒗', '𝒘', '𝒙', '𝒚', '𝒛', '𝒜', 'ℬ',
121+
'𝒞', '𝒟', 'ℰ', 'ℱ', '𝒢', 'ℋ', '𝒥', '𝒦', 'ℒ', 'ℳ', '𝒩', '𝒪', '𝒬', 'ℛ', '𝒮', '𝒯',
122+
'𝒰', '𝒱', '𝒲', '𝒳', '𝒴', '𝒵', '𝒶', '𝒷', '𝒸', '𝒹', 'ℯ', '𝒻', 'ℊ', '𝒽', '𝒾', '𝒿',
123+
'𝓀', '𝓁', '𝓂', '𝓃', 'ℴ', '𝓅', '𝓆', '𝓇', '𝓈', '𝓉', '𝓊', '𝓋', '𝓌', '𝓍', '𝓎', '𝓏',
124+
'𝓐', '𝓑', '𝓒', '𝓓', '𝓔', '𝓕', '𝓖', '𝓗', '𝓘', '𝓙', '𝓚', '𝓛', '𝓜', '𝓝', '𝓞', '𝓟',
125+
'𝓠', '𝓡', '𝓢', '𝓣', '𝓤', '𝓥', '𝓦', '𝓧', '𝓨', '𝓩', '𝓪', '𝓫', '𝓬', '𝓭', '𝓮', '𝓯',
126+
'𝓰', '𝓱', '𝓲', '𝓳', '𝓴', '𝓵', '𝓶', '𝓷', '𝓸', '𝓹', '𝓺', '𝓻', '𝓼', '𝓽', '𝓾', '𝓿',
127+
'𝔀', '𝔁', '𝔂', '𝔃', '𝔄', '𝔅', 'ℭ', '𝔇', '𝔈', '𝔉', '𝔊', 'ℌ', '𝔍', '𝔎', '𝔏', '𝔐',
128+
'𝔑', '𝔒', '𝔓', '𝔔', '𝔖', '𝔗', '𝔘', '𝔙', '𝔚', '𝔛', '𝔜', 'ℨ', '𝔞', '𝔟', '𝔠', '𝔡',
129+
'𝔢', '𝔣', '𝔤', '𝔥', '𝔦', '𝔧', '𝔨', '𝔩', '𝔪', '𝔫', '𝔬', '𝔭', '𝔮', '𝔯', '𝔰', '𝔱',
130+
'𝔲', '𝔳', '𝔴', '𝔵', '𝔶', '𝔷', '¥', 'ϱ', 'ϰ', 'ϗ', 'ϖ', 'ϕ', 'ϑ', '⊲', '⊳', 'ϐ',
131+
'⊻', 'ě', 'Ě', '⋮', 'ď', 'Ď', 'č', 'Č', 'ϟ', '₭', 'į', 'Į', 'K', 'ϧ', '⚠', '℘',
132+
'≀', 'Ϯ', 'Ϝ', 'Ð', 'Η', '≎', '𝔻', '𝔼', '𝔾', '𝕀', '𝕁', '𝕃', '𝕄', '𝕆', '𝕊', '𝕋',
133+
'𝕌', '𝕍', '𝕎', '𝕏', '𝕐', '𝕒', '𝕓', '𝕔', '𝕕', '𝕖', '𝕗', '𝕘', '𝕙', '𝕚', '𝕛', '𝕜',
134+
'𝕝', '𝕞', '𝕟', '𝕠', '𝕡', '𝕢', '𝕣', '𝕤', '𝕥', '𝕦', '𝕧', '𝕨', '𝕩', '𝕪', '𝕫', '⨯',
135+
'⨿', 'Ϳ', '⧾', '⧿', '¡'
136+
]
137+
138+
/--
139+
Other unicode characters present in Mathlib but not present in any of the above lists
140+
(as of March 17, 2026).
141+
142+
This list may be extended by more characters for which no VSCode-extension abbreviation exists.
143+
Some guidelines:
144+
- No invisible characters
145+
- No characters affecting text directionality (Mathlib uses left-to-right text).
146+
- No [Private Use Areas](https://en.wikipedia.org/wiki/Private_Use_Areas)
147+
148+
The list `withVSCodeAbbrev` contains characters with abbreviations; if an abbreviation
149+
already has been defined, you might need to update the list `withVSCodeAbbrev`.
150+
Further, consider proposing an abbreviation to [the extension](https://github.com/leanprover/vscode-lean4) for new symbols.
151+
152+
Empty lines for improved readability (otherwise some characters overlap in some fonts).
153+
-/
154+
def othersInMathlib : Array Char := #[
155+
'✔', '⟍', 'ł', 'ń', '⎯', '⏐', 'ć', 'š', '̂', 'ᘁ', '𝖣', 'ß', 'ỳ', '⤏',
156+
157+
'┌', '┐', '│', '├', '└', '┬', '┘', '▼', '◄', '⋅', 'ś', '-', '\', '◥', '/', '◢',
158+
159+
'◿', '◹', 'ő', '⥥', '⤞', '⥢', '╱', '⟋', 'Ž', 'ą', 'Š', 'ầ', ':', '꙳', '⎛',
160+
161+
'⎞', '⎜', '⎟', '⎝', '⎠', 'ă', 'ĝ', 'ᵧ', '▶', '‑', '‾', 'ř', '⏎', '‐', '𐞥',
162+
163+
'ꟴ', 'ᵟ', 'ᴀ', 'ʙ', 'ᴄ', 'ᴅ', 'ᴇ', 'ꜰ', 'ɢ', 'ʜ', 'ɪ', 'ᴊ', 'ᴋ', 'ʟ', 'ᴍ', 'ɴ',
164+
165+
'ᴏ', 'ᴘ', 'ꞯ', 'ʀ', 'ꜱ', 'ᴛ', 'ᴜ', 'ᴠ', 'ᴡ', 'ʏ', 'ᴢ', 'ᵦ', 'ᵨ', 'ᵩ', 'ᵪ', 'Ś', 'ę'
166+
]
167+
168+
/-- Unicode symbols in mathlib that should always be followed by the emoji variant selector. -/
169+
public def emojis : Array Char := #[
170+
'\u2705', -- ✅️
171+
'\u274C', -- ❌️
172+
-- Note: writing e.g. '\u1F4A5' gives error: "missing end of character literal"
173+
-- see https://github.com/leanprover/lean4/blob/4eea57841d1012d6c2edab0f270e433d43f92520/src/Lean/Parser/Basic.lean#L709
174+
.ofNat 0x1F4A5, -- 💥️
175+
.ofNat 0x1F7E1, -- 🟡️
176+
.ofNat 0x1F4A1, -- 💡️
177+
.ofNat 0x1F419, -- 🐙️
178+
.ofNat 0x1F50D, -- 🔍️
179+
.ofNat 0x1F389, -- 🎉️
180+
'\u23F3', -- ⏳️
181+
.ofNat 0x1F3C1 -- 🏁️
182+
]
183+
184+
/-- Unicode symbols in mathlib that should always be followed by the text variant selector. -/
185+
public def nonEmojis : Array Char := #[]
186+
42187
/-- Blocklist: If `false`, the character is not allowed in Mathlib.
43188
Note: if `true`, a character might still not be allowed depending on context
44189
(e.g. misplaced variant selectors).
45190
-/
46-
public def isAllowedCharacter (c : Char) : Bool := !#[
47-
'\u0009', -- "TAB"
48-
'\u000B', -- "LINE TABULATION"
49-
'\u000C', -- "FORM FEED"
50-
'\u000D', -- "CARRIAGE RETURN" (note: CRLF is treated separately before `unicodeLinter` starts)
51-
'\u001F', -- "INFORMATION SEPARATOR ONE"
52-
'\u00A0', -- "NO-BREAK SPACE"
53-
'\u1680', -- "OGHAM SPACE MARK"
54-
'\u2000', -- "EN QUAD"
55-
'\u2001', -- "EM QUAD"
56-
'\u2002', -- "EN SPACE"
57-
'\u2003', -- "EM SPACE"
58-
'\u2004', -- "THREE-PER-EM SPACE"
59-
'\u2005', -- "FOUR-PER-EM SPACE"
60-
'\u2006', -- "SIX-PER-EM SPACE"
61-
'\u2007', -- "FIGURE SPACE"
62-
'\u2008', -- "PUNCTUATION SPACE"
63-
'\u2009', -- "THIN SPACE"
64-
'\u200A', -- "HAIR SPACE"
65-
'\u200B', -- "ZERO WIDTH SPACE"
66-
'\u200C', -- "ZERO WIDTH NON-JOINER"
67-
'\u200D', -- "ZERO WIDTH JOINER"
68-
'\u200E', -- "LEFT-TO-RIGHT MARK"
69-
'\u200F', -- "RIGHT-TO-LEFT MARK"
70-
'\u2028', -- "LINE SEPARATOR"
71-
'\u2029', -- "PARAGRAPH SEPARATOR"
72-
'\u202A', -- "LEFT-TO-RIGHT EMBEDDING"
73-
'\u202B', -- "RIGHT-TO-LEFT EMBEDDING"
74-
'\u202C', -- "POP DIRECTIONAL FORMATTING"
75-
'\u202D', -- "LEFT-TO-RIGHT OVERRIDE"
76-
'\u202E', -- "RIGHT-TO-LEFT OVERRIDE"
77-
'\u202F', -- "NARROW NO-BREAK SPACE"
78-
'\u205F', -- "MEDIUM MATHEMATICAL SPACE"
79-
'\u2060', -- "WORD JOINER"
80-
'\u2062', -- "INVISIBLE TIMES"
81-
'\u2063', -- "INVISIBLE SEPARATOR"
82-
'\u2064', -- "INVISIBLE PLUS"
83-
'\u2066', -- "LEFT-TO-RIGHT ISOLATE"
84-
'\u2067', -- "RIGHT-TO-LEFT ISOLATE"
85-
'\u2068', -- "FIRST STRONG ISOLATE"
86-
'\u2069', -- "POP DIRECTIONAL ISOLATE"
87-
'\u206A', -- "INHIBIT SYMMETRIC SWAPPING"
88-
'\u206B', -- "ACTIVATE SYMMETRIC SWAPPING"
89-
'\u206C', -- "INHIBIT ARABIC FORM SHAPING"
90-
'\u206D', -- "ACTIVATE ARABIC FORM SHAPING"
91-
'\u206E', -- "NATIONAL DIGIT SHAPES"
92-
'\u206F', -- "NOMINAL DIGIT SHAPES"
93-
'\u3000' -- "IDEOGRAPHIC SPACE"
94-
].contains c
191+
public def isAllowedCharacter (c : Char) : Bool :=
192+
ASCII.allowed c
193+
|| withVSCodeAbbrev.contains c
194+
|| emojis.contains c
195+
|| nonEmojis.contains c
196+
|| c == UnicodeVariant.emoji
197+
|| c == UnicodeVariant.text
198+
|| othersInMathlib.contains c
95199

96200
/-- Provide default replacement (`String`) for a blocklisted character, or `none` if none defined -/
97201
public def replaceDisallowed : Char → Option String
@@ -144,22 +248,4 @@ public def replaceDisallowed : Char → Option String
144248
| '\u3000' => " " -- "IDEOGRAPHIC SPACE" => "2 spaces"
145249
| _ => none
146250

147-
/-- Unicode symbols in mathlib that should always be followed by the emoji variant selector. -/
148-
public def emojis : Array Char := #[
149-
'\u2705', -- ✅️
150-
'\u274C', -- ❌️
151-
-- TODO "missing end of character literal" if written as '\u1F4A5'
152-
-- see https://github.com/leanprover/lean4/blob/4eea57841d1012d6c2edab0f270e433d43f92520/src/Lean/Parser/Basic.lean#L709
153-
.ofNat 0x1F4A5, -- 💥️
154-
.ofNat 0x1F7E1, -- 🟡️
155-
.ofNat 0x1F4A1, -- 💡️
156-
.ofNat 0x1F419, -- 🐙️
157-
.ofNat 0x1F50D, -- 🔍️
158-
.ofNat 0x1F389, -- 🎉️
159-
'\u23F3', -- ⏳️
160-
.ofNat 0x1F3C1 ] -- 🏁️
161-
162-
/-- Unicode symbols in mathlib that should always be followed by the text variant selector. -/
163-
public def nonEmojis : Array Char := #[]
164-
165251
end Mathlib.Linter.TextBased.UnicodeLinter

Mathlib/Topology/Baire/LocallyCompactRegular.lean

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -46,7 +46,7 @@ instance (priority := 100) BaireSpace.of_t2Space_locallyCompactSpace : BaireSpac
4646
refine ⟨fun n ↦ ?_, fun n ↦ (hK_next n _).trans inter_subset_left, hK₀⟩
4747
exact subset_closure.trans <| (hK_next _ _).trans <|
4848
inter_subset_right.trans interior_subset
49-
-- Prove that ̀`⋂ n : ℕ, closure (K n)` is inside `U ∩ ⋂ n : ℕ, f n`.
49+
-- Prove that `⋂ n : ℕ, closure (K n)` is inside `U ∩ ⋂ n : ℕ, f n`.
5050
have hK_subset : (⋂ n, closure (K n) : Set X) ⊆ U ∩ ⋂ n, f n := fun x hx ↦ by
5151
simp only [mem_iInter, mem_inter_iff] at hx ⊢
5252
exact ⟨hKU <| hx 0, fun n ↦ hKf n <| hx (n + 1)⟩

0 commit comments

Comments
 (0)