Skip to content

Commit

Permalink
pythongh-126505: Fix bugs in compiling case-insensitive character cla…
Browse files Browse the repository at this point in the history
…sses (pythonGH-126557)

* upper-case non-BMP character was ignored
* the ASCII flag was ignored when matching a character range whose
  upper bound is beyond the BMP region
(cherry picked from commit 819830f)

Co-authored-by: Serhiy Storchaka <[email protected]>
  • Loading branch information
serhiy-storchaka authored and miss-islington committed Nov 11, 2024
1 parent aee80cd commit a6006ca
Show file tree
Hide file tree
Showing 3 changed files with 73 additions and 9 deletions.
23 changes: 14 additions & 9 deletions Lib/re/_compiler.py
Original file line number Diff line number Diff line change
Expand Up @@ -250,19 +250,19 @@ def _optimize_charset(charset, iscased=None, fixup=None, fixes=None):
while True:
try:
if op is LITERAL:
if fixup:
lo = fixup(av)
charmap[lo] = 1
if fixes and lo in fixes:
for k in fixes[lo]:
if fixup: # IGNORECASE and not LOCALE
av = fixup(av)
charmap[av] = 1
if fixes and av in fixes:
for k in fixes[av]:
charmap[k] = 1
if not hascased and iscased(av):
hascased = True
else:
charmap[av] = 1
elif op is RANGE:
r = range(av[0], av[1]+1)
if fixup:
if fixup: # IGNORECASE and not LOCALE
if fixes:
for i in map(fixup, r):
charmap[i] = 1
Expand All @@ -289,8 +289,7 @@ def _optimize_charset(charset, iscased=None, fixup=None, fixes=None):
# Character set contains non-BMP character codes.
# For range, all BMP characters in the range have already
# been processed.
if fixup:
hascased = True
if fixup: # IGNORECASE and not LOCALE
# For now, IN_UNI_IGNORE+LITERAL and
# IN_UNI_IGNORE+RANGE_UNI_IGNORE work for all non-BMP
# characters, because two characters (at least one of
Expand All @@ -301,7 +300,13 @@ def _optimize_charset(charset, iscased=None, fixup=None, fixes=None):
# Also, both c.lower() and c.lower().upper() are single
# characters for every non-BMP character.
if op is RANGE:
op = RANGE_UNI_IGNORE
if fixes: # not ASCII
op = RANGE_UNI_IGNORE
hascased = True
else:
assert op is LITERAL
if not hascased and iscased(av):
hascased = True
tail.append((op, av))
break

Expand Down
55 changes: 55 additions & 0 deletions Lib/test/test_re.py
Original file line number Diff line number Diff line change
Expand Up @@ -1073,6 +1073,39 @@ def test_ignore_case_set(self):
self.assertTrue(re.match(br'[19a]', b'a', re.I))
self.assertTrue(re.match(br'[19a]', b'A', re.I))
self.assertTrue(re.match(br'[19A]', b'a', re.I))
self.assertTrue(re.match(r'[19\xc7]', '\xc7', re.I))
self.assertTrue(re.match(r'[19\xc7]', '\xe7', re.I))
self.assertTrue(re.match(r'[19\xe7]', '\xc7', re.I))
self.assertTrue(re.match(r'[19\xe7]', '\xe7', re.I))
self.assertTrue(re.match(r'[19\u0400]', '\u0400', re.I))
self.assertTrue(re.match(r'[19\u0400]', '\u0450', re.I))
self.assertTrue(re.match(r'[19\u0450]', '\u0400', re.I))
self.assertTrue(re.match(r'[19\u0450]', '\u0450', re.I))
self.assertTrue(re.match(r'[19\U00010400]', '\U00010400', re.I))
self.assertTrue(re.match(r'[19\U00010400]', '\U00010428', re.I))
self.assertTrue(re.match(r'[19\U00010428]', '\U00010400', re.I))
self.assertTrue(re.match(r'[19\U00010428]', '\U00010428', re.I))

self.assertTrue(re.match(br'[19A]', b'A', re.I))
self.assertTrue(re.match(br'[19a]', b'a', re.I))
self.assertTrue(re.match(br'[19a]', b'A', re.I))
self.assertTrue(re.match(br'[19A]', b'a', re.I))
self.assertTrue(re.match(r'[19A]', 'A', re.I|re.A))
self.assertTrue(re.match(r'[19a]', 'a', re.I|re.A))
self.assertTrue(re.match(r'[19a]', 'A', re.I|re.A))
self.assertTrue(re.match(r'[19A]', 'a', re.I|re.A))
self.assertTrue(re.match(r'[19\xc7]', '\xc7', re.I|re.A))
self.assertIsNone(re.match(r'[19\xc7]', '\xe7', re.I|re.A))
self.assertIsNone(re.match(r'[19\xe7]', '\xc7', re.I|re.A))
self.assertTrue(re.match(r'[19\xe7]', '\xe7', re.I|re.A))
self.assertTrue(re.match(r'[19\u0400]', '\u0400', re.I|re.A))
self.assertIsNone(re.match(r'[19\u0400]', '\u0450', re.I|re.A))
self.assertIsNone(re.match(r'[19\u0450]', '\u0400', re.I|re.A))
self.assertTrue(re.match(r'[19\u0450]', '\u0450', re.I|re.A))
self.assertTrue(re.match(r'[19\U00010400]', '\U00010400', re.I|re.A))
self.assertIsNone(re.match(r'[19\U00010400]', '\U00010428', re.I|re.A))
self.assertIsNone(re.match(r'[19\U00010428]', '\U00010400', re.I|re.A))
self.assertTrue(re.match(r'[19\U00010428]', '\U00010428', re.I|re.A))

# Two different characters have the same lowercase.
assert 'K'.lower() == '\u212a'.lower() == 'k' # 'K'
Expand Down Expand Up @@ -1109,8 +1142,10 @@ def test_ignore_case_range(self):
self.assertTrue(re.match(br'[9-a]', b'_', re.I))
self.assertIsNone(re.match(br'[9-A]', b'_', re.I))
self.assertTrue(re.match(r'[\xc0-\xde]', '\xd7', re.I))
self.assertTrue(re.match(r'[\xc0-\xde]', '\xe7', re.I))
self.assertIsNone(re.match(r'[\xc0-\xde]', '\xf7', re.I))
self.assertTrue(re.match(r'[\xe0-\xfe]', '\xf7', re.I))
self.assertTrue(re.match(r'[\xe0-\xfe]', '\xc7', re.I))
self.assertIsNone(re.match(r'[\xe0-\xfe]', '\xd7', re.I))
self.assertTrue(re.match(r'[\u0430-\u045f]', '\u0450', re.I))
self.assertTrue(re.match(r'[\u0430-\u045f]', '\u0400', re.I))
Expand All @@ -1121,6 +1156,26 @@ def test_ignore_case_range(self):
self.assertTrue(re.match(r'[\U00010400-\U00010427]', '\U00010428', re.I))
self.assertTrue(re.match(r'[\U00010400-\U00010427]', '\U00010400', re.I))

self.assertTrue(re.match(r'[\xc0-\xde]', '\xd7', re.I|re.A))
self.assertIsNone(re.match(r'[\xc0-\xde]', '\xe7', re.I|re.A))
self.assertTrue(re.match(r'[\xe0-\xfe]', '\xf7', re.I|re.A))
self.assertIsNone(re.match(r'[\xe0-\xfe]', '\xc7', re.I|re.A))
self.assertTrue(re.match(r'[\u0430-\u045f]', '\u0450', re.I|re.A))
self.assertIsNone(re.match(r'[\u0430-\u045f]', '\u0400', re.I|re.A))
self.assertIsNone(re.match(r'[\u0400-\u042f]', '\u0450', re.I|re.A))
self.assertTrue(re.match(r'[\u0400-\u042f]', '\u0400', re.I|re.A))
self.assertTrue(re.match(r'[\U00010428-\U0001044f]', '\U00010428', re.I|re.A))
self.assertIsNone(re.match(r'[\U00010428-\U0001044f]', '\U00010400', re.I|re.A))
self.assertIsNone(re.match(r'[\U00010400-\U00010427]', '\U00010428', re.I|re.A))
self.assertTrue(re.match(r'[\U00010400-\U00010427]', '\U00010400', re.I|re.A))

self.assertTrue(re.match(r'[N-\x7f]', 'A', re.I|re.A))
self.assertTrue(re.match(r'[n-\x7f]', 'Z', re.I|re.A))
self.assertTrue(re.match(r'[N-\uffff]', 'A', re.I|re.A))
self.assertTrue(re.match(r'[n-\uffff]', 'Z', re.I|re.A))
self.assertTrue(re.match(r'[N-\U00010000]', 'A', re.I|re.A))
self.assertTrue(re.match(r'[n-\U00010000]', 'Z', re.I|re.A))

# Two different characters have the same lowercase.
assert 'K'.lower() == '\u212a'.lower() == 'k' # 'K'
self.assertTrue(re.match(r'[J-M]', '\u212a', re.I))
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,4 @@
Fix bugs in compiling case-insensitive :mod:`regular expressions <re>` with
character classes containing non-BMP characters: upper-case non-BMP
character was ignored and the ASCII flag was ignored when
matching a character range whose upper bound is beyond the BMP region.

0 comments on commit a6006ca

Please sign in to comment.