From c7929c7a285fbc5362b3448ac3d1397e267256df Mon Sep 17 00:00:00 2001 From: Isaac Muse Date: Fri, 12 Apr 2019 20:57:39 -0600 Subject: [PATCH] All pseudo-classes names are case insensitive and allow CSS escapes (#141) --- docs/src/markdown/about/changelog.md | 1 + soupsieve/css_parser.py | 155 +++++++++++++++++++-------- tests/test_extra/test_custom.py | 15 +++ tests/test_level1/test_link.py | 7 ++ tests/test_level3/test_nth_child.py | 35 ++++++ 5 files changed, 168 insertions(+), 45 deletions(-) diff --git a/docs/src/markdown/about/changelog.md b/docs/src/markdown/about/changelog.md index 575905d7..5dcbf722 100644 --- a/docs/src/markdown/about/changelog.md +++ b/docs/src/markdown/about/changelog.md @@ -7,6 +7,7 @@ for which the element under consideration applies. - **FIX**: HTML pseudo-classes will check that all key elements checked are in the XHTML namespace (HTML parsers that do not provide namespaces will assume the XHTML namespace). +- **FIX**: Ensure that all pseudo-classes names are case insensitive and allow CSS escapes. ## 1.9.0 diff --git a/soupsieve/css_parser.py b/soupsieve/css_parser.py index d20ebf06..d8244565 100644 --- a/soupsieve/css_parser.py +++ b/soupsieve/css_parser.py @@ -4,7 +4,6 @@ from . import util from . import css_match as cm from . import css_types as ct -from collections import OrderedDict from .util import SelectorSyntaxError UNICODE_REPLACEMENT_CHAR = 0xFFFD @@ -132,6 +131,8 @@ '''.format(ws=WSC, ident=IDENTIFIER, attr=QUIRKS_ATTR) # Pseudo class (`:pseudo-class`, `:pseudo-class(`) PAT_PSEUDO_CLASS = r'(?P:{ident})(?P\({ws}*)?'.format(ws=WSC, ident=IDENTIFIER) +# Pseudo class special patterns. Matches `:pseudo-class(` for special case pseudo classes. +PAT_PSEUDO_CLASS_SPECIAL = r'(?P:{ident})(?P\({ws}*)'.format(ws=WSC, ident=IDENTIFIER) # Custom pseudo class (`:--custom-pseudo`) PAT_PSEUDO_CLASS_CUSTOM = r'(?P:(?=--){ident})'.format(ident=IDENTIFIER) # Closing pseudo group (`)`) @@ -142,22 +143,26 @@ PAT_AT_RULE = r'@P{ident}'.format(ident=IDENTIFIER) # Pseudo class `nth-child` (`:nth-child(an+b [of S]?)`, `:first-child`, etc.) PAT_PSEUDO_NTH_CHILD = r''' -(?P:nth-(?:last-)?child -\({ws}*(?P{nth}|even|odd))(?:{wsc}*\)|(?P{comments}*{ws}{wsc}*of{comments}*{ws}{wsc}*)) -'''.format(wsc=WSC, comments=COMMENTS, ws=WS, nth=NTH) +(?P{name} +(?P{nth}|even|odd))(?:{wsc}*\)|(?P{comments}*{ws}{wsc}*of{comments}*{ws}{wsc}*)) +'''.format(name=PAT_PSEUDO_CLASS_SPECIAL, wsc=WSC, comments=COMMENTS, ws=WS, nth=NTH) # Pseudo class `nth-of-type` (`:nth-of-type(an+b)`, `:first-of-type`, etc.) PAT_PSEUDO_NTH_TYPE = r''' -(?P:nth-(?:last-)?of-type -\({ws}*(?P{nth}|even|odd)){ws}*\) -'''.format(ws=WSC, nth=NTH) +(?P{name} +(?P{nth}|even|odd)){ws}*\) +'''.format(name=PAT_PSEUDO_CLASS_SPECIAL, ws=WSC, nth=NTH) # Pseudo class language (`:lang("*-de", en)`) -PAT_PSEUDO_LANG = r':lang\({ws}*(?P{value}(?:{ws}*,{ws}*{value})*){ws}*\)'.format(ws=WSC, value=VALUE) +PAT_PSEUDO_LANG = r'{name}(?P{value}(?:{ws}*,{ws}*{value})*){ws}*\)'.format( + name=PAT_PSEUDO_CLASS_SPECIAL, ws=WSC, value=VALUE +) # Pseudo class direction (`:dir(ltr)`) -PAT_PSEUDO_DIR = r':dir\({ws}*(?Pltr|rtl){ws}*\)'.format(ws=WSC) +PAT_PSEUDO_DIR = r'{name}(?Pltr|rtl){ws}*\)'.format(name=PAT_PSEUDO_CLASS_SPECIAL, ws=WSC) # Combining characters (`>`, `~`, ` `, `+`, `,`) PAT_COMBINE = r'{wsc}*?(?P[,+>~]|{ws}(?![,+>~])){wsc}*'.format(ws=WS, wsc=WSC) # Extra: Contains (`:contains(text)`) -PAT_PSEUDO_CONTAINS = r':contains\({ws}*(?P{value}(?:{ws}*,{ws}*{value})*){ws}*\)'.format(ws=WSC, value=VALUE) +PAT_PSEUDO_CONTAINS = r'{name}(?P{value}(?:{ws}*,{ws}*{value})*){ws}*\)'.format( + name=PAT_PSEUDO_CLASS_SPECIAL, ws=WSC, value=VALUE +) # Regular expressions # CSS escape pattern @@ -230,7 +235,7 @@ def process_custom(custom): raise SelectorSyntaxError("The name '{}' is not a valid custom pseudo-class name".format(name)) if name in custom_selectors: raise KeyError("The custom selector '{}' has already been registered".format(name)) - custom_selectors[name] = value + custom_selectors[css_unescape(name)] = value return custom_selectors @@ -292,16 +297,69 @@ def escape(ident): class SelectorPattern(object): """Selector pattern.""" - def __init__(self, pattern): + def __init__(self, name, pattern): + """Initialize.""" + + self.name = name + self.re_pattern = re.compile(pattern, re.I | re.X | re.U) + + def get_name(self): + """Get name.""" + + return self.name + + def enabled(self, flags): + """Enabled.""" + + return True + + def match(self, selector, index): + """Match the selector.""" + + return self.re_pattern.match(selector, index) + + +class SpecialPseudoPattern(SelectorPattern): + """Selector pattern.""" + + def __init__(self, patterns): """Initialize.""" - self.pattern = re.compile(pattern, re.I | re.X | re.U) + self.patterns = {} + for p in patterns: + name = p[0] + pattern = SelectorPattern(name, p[2]) + for pseudo in p[1]: + self.patterns[pseudo] = pattern + + self.matched_name = None + self.re_pseudo_name = re.compile(PAT_PSEUDO_CLASS_SPECIAL, re.I | re.X | re.U) + + def get_name(self): + """Get name.""" + + return self.matched_name.get_name() def enabled(self, flags): """Enabled.""" return True + def match(self, selector, index): + """Match the selector.""" + + pseudo = None + m = self.re_pseudo_name.match(selector, index) + if m: + name = util.lower(css_unescape(m.group('name'))) + pattern = self.patterns.get(name) + if pattern: + pseudo = pattern.match(selector, index) + if pseudo: + self.matched_name = pattern + + return pseudo + class QuirkPattern(SelectorPattern): """Selector pattern for quirk mode.""" @@ -384,25 +442,27 @@ def __str__(self): # pragma: no cover class CSSParser(object): """Parse CSS selectors.""" - css_tokens = OrderedDict( - [ - ("pseudo_close", SelectorPattern(PAT_PSEUDO_CLOSE)), - ("pseudo_contains", SelectorPattern(PAT_PSEUDO_CONTAINS)), - ("pseudo_nth_child", SelectorPattern(PAT_PSEUDO_NTH_CHILD)), - ("pseudo_nth_type", SelectorPattern(PAT_PSEUDO_NTH_TYPE)), - ("pseudo_lang", SelectorPattern(PAT_PSEUDO_LANG)), - ("pseudo_dir", SelectorPattern(PAT_PSEUDO_DIR)), - ("pseudo_class_custom", SelectorPattern(PAT_PSEUDO_CLASS_CUSTOM)), - ("pseudo_class", SelectorPattern(PAT_PSEUDO_CLASS)), - ("pseudo_element", SelectorPattern(PAT_PSEUDO_ELEMENT)), - ("at_rule", SelectorPattern(PAT_AT_RULE)), - ("id", SelectorPattern(PAT_ID)), - ("class", SelectorPattern(PAT_CLASS)), - ("tag", SelectorPattern(PAT_TAG)), - ("attribute", SelectorPattern(PAT_ATTR)), - ("quirks_attribute", QuirkPattern(PAT_QUIRKS_ATTR)), - ("combine", SelectorPattern(PAT_COMBINE)) - ] + css_tokens = ( + SelectorPattern("pseudo_close", PAT_PSEUDO_CLOSE), + SpecialPseudoPattern( + ( + ("pseudo_contains", (':contains',), PAT_PSEUDO_CONTAINS), + ("pseudo_nth_child", (':nth-child', ':nth-last-child'), PAT_PSEUDO_NTH_CHILD), + ("pseudo_nth_type", (':nth-of-type', ':nth-last-of-type'), PAT_PSEUDO_NTH_TYPE), + ("pseudo_lang", (':lang',), PAT_PSEUDO_LANG), + ("pseudo_dir", (':dir',), PAT_PSEUDO_DIR) + ) + ), + SelectorPattern("pseudo_class_custom", PAT_PSEUDO_CLASS_CUSTOM), + SelectorPattern("pseudo_class", PAT_PSEUDO_CLASS), + SelectorPattern("pseudo_element", PAT_PSEUDO_ELEMENT), + SelectorPattern("at_rule", PAT_AT_RULE), + SelectorPattern("id", PAT_ID), + SelectorPattern("class", PAT_CLASS), + SelectorPattern("tag", PAT_TAG), + SelectorPattern("attribute", PAT_ATTR), + QuirkPattern("quirks_attribute", PAT_QUIRKS_ATTR), + SelectorPattern("combine", PAT_COMBINE) ) def __init__(self, selector, custom=None, flags=0): @@ -511,7 +571,7 @@ def parse_pseudo_class_custom(self, sel, m, has_selector): set it to `None` in the dictionary so we can avoid an infinite loop. """ - pseudo = util.lower(m.group('name')) + pseudo = util.lower(css_unescape(m.group('name'))) selector = self.custom.get(pseudo) if selector is None: raise SelectorSyntaxError( @@ -535,7 +595,7 @@ def parse_pseudo_class(self, sel, m, has_selector, iselector, is_html): """Parse pseudo class.""" complex_pseudo = False - pseudo = util.lower(m.group('name')) + pseudo = util.lower(css_unescape(m.group('name'))) if m.group('open'): complex_pseudo = True if complex_pseudo and pseudo in PSEUDO_COMPLEX: @@ -623,8 +683,12 @@ def parse_pseudo_nth(self, sel, m, has_selector, iselector): """Parse `nth` pseudo.""" mdict = m.groupdict() - postfix = '_child' if mdict.get('pseudo_nth_child') else '_type' - content = mdict.get('nth' + postfix) + if mdict.get('pseudo_nth_child'): + postfix = '_child' + else: + postfix = '_type' + mdict['name'] = util.lower(css_unescape(mdict['name'])) + content = util.lower(mdict.get('nth' + postfix)) if content == 'even': # 2n s1 = 2 @@ -654,7 +718,7 @@ def parse_pseudo_nth(self, sel, m, has_selector, iselector): s1 = int(s1, 10) s2 = int(s2, 10) - pseudo_sel = util.lower(m.group('pseudo_nth' + postfix)) + pseudo_sel = mdict['name'] if postfix == '_child': if m.group('of'): # Parse the rest of `of S`. @@ -662,14 +726,14 @@ def parse_pseudo_nth(self, sel, m, has_selector, iselector): else: # Use default `*|*` for `of S`. nth_sel = CSS_NTH_OF_S_DEFAULT - if pseudo_sel.startswith(':nth-child'): + if pseudo_sel == ':nth-child': sel.nth.append(ct.SelectorNth(s1, var, s2, False, False, nth_sel)) - elif pseudo_sel.startswith(':nth-last-child'): + elif pseudo_sel == ':nth-last-child': sel.nth.append(ct.SelectorNth(s1, var, s2, False, True, nth_sel)) else: - if pseudo_sel.startswith(':nth-of-type'): + if pseudo_sel == ':nth-of-type': sel.nth.append(ct.SelectorNth(s1, var, s2, True, False, ct.SelectorList())) - elif pseudo_sel.startswith(':nth-last-of-type'): + elif pseudo_sel == ':nth-last-of-type': sel.nth.append(ct.SelectorNth(s1, var, s2, True, True, ct.SelectorList())) has_selector = True return has_selector @@ -1007,15 +1071,16 @@ def selector_iter(self, pattern): print('## PARSING: {!r}'.format(pattern)) while index <= end: m = None - for k, v in self.css_tokens.items(): + for v in self.css_tokens: if not v.enabled(self.flags): # pragma: no cover continue - m = v.pattern.match(pattern, index) + m = v.match(pattern, index) if m: + name = v.get_name() if self.debug: # pragma: no cover - print("TOKEN: '{}' --> {!r} at position {}".format(k, m.group(0), m.start(0))) + print("TOKEN: '{}' --> {!r} at position {}".format(name, m.group(0), m.start(0))) index = m.end(0) - yield k, m + yield name, m break if m is None: c = pattern[index] diff --git a/tests/test_extra/test_custom.py b/tests/test_extra/test_custom.py index d2d90564..465d627b 100644 --- a/tests/test_extra/test_custom.py +++ b/tests/test_extra/test_custom.py @@ -48,6 +48,21 @@ def test_custom_selectors(self): flags=util.HTML ) + def test_custom_escapes(self): + """Test custom selectors with escapes.""" + + custom_selectors = { + r":--Header\s": "h1, h2, h3, h4, h5, h6" + } + + self.assert_selector( + self.MARKUP, + r':--\HeaderS', + ['1', '2'], + custom=custom_selectors, + flags=util.HTML + ) + def test_custom_selectors_exotic(self): """Test custom selectors.""" diff --git a/tests/test_level1/test_link.py b/tests/test_level1/test_link.py index a31e0ff7..f2ad6a64 100644 --- a/tests/test_level1/test_link.py +++ b/tests/test_level1/test_link.py @@ -25,6 +25,13 @@ def test_link(self): flags=util.HTML ) + self.assert_selector( + self.MARKUP, + r":\liNk", + ["2"], + flags=util.HTML + ) + def test_tag_and_link(self): """Test link and tag (all links are unvisited).""" diff --git a/tests/test_level3/test_nth_child.py b/tests/test_level3/test_nth_child.py index 867ebb95..9decc0dc 100644 --- a/tests/test_level3/test_nth_child.py +++ b/tests/test_level3/test_nth_child.py @@ -42,6 +42,20 @@ def test_nth_child(self): flags=util.HTML ) + self.assert_selector( + markup, + "p:NTH-CHILD(2)", + ['1'], + flags=util.HTML + ) + + self.assert_selector( + markup, + r"p:NT\H-CH\ILD(2)", + ['1'], + flags=util.HTML + ) + def test_nth_child_odd(self): """Test `nth` child odd.""" @@ -69,6 +83,13 @@ def test_nth_child_odd(self): flags=util.HTML ) + self.assert_selector( + markup, + "p:nth-child(ODD)", + ['0', '8', '10'], + flags=util.HTML + ) + def test_nth_child_even(self): """Test `nth` child even.""" @@ -96,6 +117,13 @@ def test_nth_child_even(self): flags=util.HTML ) + self.assert_selector( + markup, + "p:nth-child(EVEN)", + ['1', '7', '9'], + flags=util.HTML + ) + def test_nth_child_complex(self): """Test `nth` child complex.""" @@ -123,6 +151,13 @@ def test_nth_child_complex(self): flags=util.HTML ) + self.assert_selector( + markup, + "p:nth-child(2N-5)", + ['0', '8', '10'], + flags=util.HTML + ) + self.assert_selector( markup, "p:nth-child(-2n+20)",