#!/usr/bin/env python3 """ html5validate ------ HTML5Lib based HTML5 validation module. MIT Licence - (C) 2019 Daniel Fairhead """ import warnings from collections import namedtuple import re from xml.dom import Node import html5lib # in case we do need to track elements, here are some objects to hold them in: DocType = namedtuple('DocType', ('name', 'publicId', 'systemId')) StartTag = namedtuple('StartTag', ('name', 'attributes')) VoidTag = namedtuple('VoidTag', ('name', 'attributes', 'has_children')) EndTag = namedtuple('EndTag', ('name')) Entity = namedtuple('Entity', ('name',)) SpaceCharacters = namedtuple('SpaceCharacters', ('data')) Characters = namedtuple('Characters', ('data')) Comment = namedtuple('Comment', ('data')) # Splits apart. TEXT_MATCH = re.compile(r'(\s*)(\S?.*\S)(\s*)') from html5lib.html5parser import ParseError class HTML5Invalid(Exception): pass #class ParseError(HTML5Invalid): # pass #class LintError(HTML5Invalid): # pass class ValidationException(HTML5Invalid): pass class InvalidTag(ValidationException): pass class EmptyPage(ValidationException): pass class MisplacedElement(ValidationException): pass class InvalidAttribute(ValidationException): pass class NonSecureRequestInSecurePage(ValidationException): pass class UnclosedTags(ValidationException): pass # 8. Namespaces: namespaces = { 'html': 'http://www.w3.org/1999/xhtml', 'mathml': "http://www.w3.org/1998/Math/MathML", 'svg': "http://www.w3.org/2000/svg", 'xlink': "http://www.w3.org/1999/xlink", 'xml': "http://www.w3.org/XML/1998/namespace", 'xmlns': "http://www.w3.org/2000/xmlns/", } metadata_elements = frozenset(('base','link','meta','noscript','script','style','template','title')) html_elements = { 'html': ('',), 'head': ('html',), 'body': ('html',), # 3.2.5.2.1 "Metadata content" 'base': ("head",), 'link': ("head", "body"), 'meta': ("head", "body"), 'noscript': ("head", "body"), 'script': ("head", "body"), 'style': ("head", "body"), 'template': ("head", "body"), 'title': ("head",), # 3.2.5.2.2 "Flow content" "a": ("body",), "abbr": ("body",), "address": ("body",), "area": ("map",), "article": ("body",), "aside": ("body",), "audio": ("body",), "b": ("body",), "bdi": ("body",), "bdo": ("body",), "blockquote": ("body",), "br": ("body",), "button": ("body",), "canvas": ("body",), "cite": ("body",), "code": ("body",), "data": ("body",), "datalist": ("body",), "del": ("body",), "details": ("body",), "summary": ("details",), #4.11.2 "dfn": ("body",), "dialog": ("body",), "div": ("body",), "dl": ("body",), "em": ("body",), "embed": ("body",), "fieldset": ("body",), "legend": ('fieldset',), #4.10.16 "figure": ("body",), "figcaption": ("figure",), #4.4.13 "footer": ("body",), "form": ("body",), "h1": ("body",), "h2": ("body",), "h3": ("body",), "h4": ("body",), "h5": ("body",), "h6": ("body",), "header": ("body",), "hgroup": ("body",), "hr": ("body",), "i": ("body",), "iframe": ("body",), "img": ("body",), "input": ("body",), "ins": ("body",), "kbd": ("body",), "label": ("body",), "li": ('ol', 'ul', 'menu'), # 4.4.8 "main": ("body",), "map": ("body",), "mark": ("body",), "math": ("body",), "menu": ("body",), "meter": ("body",), "nav": ("body",), "object": ("body",), "param": ('object',), #4.8.8 "ol": ("body",), "output": ("body",), "p": ("body",), "picture": ("body",), "pre": ("body",), "progress": ("body",), "q": ("body",), "ruby": ("body",), "rt": ('ruby',), #4.5.11 "rp": ('ruby',), #4.5.12 "s": ("body",), "samp": ("body",), "section": ("body",), "select": ("body",), "optgroup": ('select',), # 4.10.9 "option": ('select', 'datalist', 'optgroup'), # 4.10.10 "slot": ("body",), "small": ("body",), "source": ("video", "audio",), # embedded element "span": ("body",), "strong": ("body",), "sub": ("body",), "sup": ("body",), "svg": ("body",), "table": ("body",), "caption": ('table',), # 4.9.2 "colgroup": ('table',), # 4.9.3 "col": ('colgroup',), # 4.9.4 "tbody": ('table',), # 4.9.5 "thead": ('table',), # 4.9.6 "tfoot": ('table',), # 4.9.7 "tr": ('thead', 'tbody', 'tfoot', 'table'), # 4.9.8 "td": ("tr",), # 4.9.9 "th": ("tr",), # 4.9.10 "dl": ('body',), #4.4.9 "dt": ('dl',), # 4.4.10 "dd": ('dl',), # 4.4.11 "textarea": ("body",), "time": ("body",), "u": ("body",), "ul": ("body",), "var": ("body",), "video": ("body",), "wbr": ("body",), "track": ('video', 'audio'), } non_recursable = frozenset(('html', 'head', 'body','video','audio', 'noscript', 'form')) # 12.1.2 void_elements = frozenset(('area', 'base', 'br', 'col', 'embed', 'hr', 'img', 'input', 'link', 'meta', 'param', 'source', 'track', 'wbr')) global_attributes = frozenset(( # 3.2.6 - Can be for ANY "class", "id", "slot", # 3.2.6 - for HTML elements: "accesskey", "autocapitalize", "contenteditable", "dir", "draggable", "enterkeyhint", "hidden", "inputmode", "is", "itemid", "itemprop", "itemref", "itemscope", "itemtype", "lang", "nonce", "spellcheck", "tabindex", "title", "translate", # JS for any: "onabort", "onauxclick", "onblur", "oncancel", "oncanplay", "oncanplaythrough", "onchange", "onclick", "onclose", "oncontextmenu", "oncopy", "oncuechange", "oncut", "ondblclick", "ondrag", "ondragend", "ondragenter", "ondragexit", "ondragleave", "ondragover", "ondragstart", "ondrop", "ondurationchange", "onemptied", "onended", "onerror", "onfocus", "onformdata", "oninput", "oninvalid", "onkeydown", "onkeypress", "onkeyup", "onload", "onloadeddata", "onloadedmetadata", "onloadend", "onloadstart", "onmousedown", "onmouseenter", "onmouseleave", "onmousemove", "onmouseout", "onmouseover", "onmouseup", "onpaste", "onpause", "onplay", "onplaying", "onprogress", "onratechange", "onreset", "onresize", "onscroll", "onsecuritypolicyviolation", "onseeked", "onseeking", "onselect", "onstalled", "onsubmit", "onsuspend", "ontimeupdate", "ontoggle", "onvolumechange", "onwaiting", "onwheel", # ARIA 'aria-describedby', 'aria-disabled', 'aria-label', 'role', )) # 15.1 element_attribute_warnings={ 'html': ('xmlns', 'xml:lang', 'prefix',), 'script': ('charset', 'language',), 'img': ('border',), 'style': ('type',), 'a': ('name',), } element_attributes={ 'html': #4.1.1 ('manifest',), 'base': # 4.2.3 ('href', 'target'), 'canvas': # 4.12.5 ('width', 'height'), 'link': #4.2.4 ('href', 'crossorigin', 'rel', 'media', 'integrity', 'hreflang', 'type', 'referrerpolicy', 'sizes', 'imgsrcset', 'imagesizes', 'as', 'color'), 'meta': # 4.2.5 ('name', 'http-equiv', 'content', 'charset'), 'style': # 4.2.6 ('media',), 'q': # 4.5.7 ('cite',), 'img': # 4.8.3 ('alt', 'src', 'srcset', 'sizes', 'crossorigin', 'usemap', 'ismap', 'width', 'height', 'referrerpolicy', 'decoding'), 'map': #4.8.13 ('name',), 'area': # 4.8.14 ('alt', 'coords', 'shape', 'href', 'target', 'download', 'ping', 'rel','referrerpolicy'), 'col': #4.9.3 ('span',), 'td': #4.9.9 ('colspan', 'rowspan', 'headers'), 'th': # 4.9.10 ('colspan', 'rowspan', 'headers', 'scope', 'abbr'), 'form': #4.10.3 ('accept-charset', 'action', 'autocomplete', 'enctype', 'method', 'name', 'novalidate', 'target', 'rel'), 'label': #4.10.4 ('for',), 'input': #4.10.5 ('accept', 'alt', 'autocomplete', 'autofocus', 'checked', 'dirname', 'disabled', 'form', 'formaction', 'formenctype', 'formmethod', 'formnovalidate', 'formtarget', 'height', 'list', 'max', 'maxlength', 'min', 'minlength', 'multiple', 'name', 'pattern', 'placeholder', 'readonly', 'required', 'size', 'src', 'step', 'type', 'value', 'width'), 'button': # 4.10.6 ('autofocus', 'disabled', 'form', 'formaction', 'formenctype', 'formmethod', 'formnovalidate', 'formtarget', 'name', 'type', 'value'), 'select': #4.10.7 ('autocomplete', 'autofocus', 'disabled', 'form', 'multiple', 'name', 'required', 'size'), 'optgroup': #4.10.9 ('disabled', 'label'), 'option': #4.10.10 ('disabled', 'label', 'selected', 'value'), 'textarea': #4.10.11 ('autocomplete', 'autofocus', 'cols', 'dirname', 'disabled', 'form', 'maxlength', 'minlength', 'name', 'placeholder', 'readonly', 'required', 'rows', 'wrap'), 'output': #4.10.12 ('for', 'form', 'name'), 'progress': #4.10.13 ('value', 'max'), 'meter': #4.10.14 ('value', 'min', 'max', 'low', 'high', 'optimum'), 'fieldset': #4.10.15 ('disabled', 'form', 'name'), 'details': #4.11.1 ('open',), 'object': # 4.8.7 ('data', 'type','name', 'usemap', 'form', 'width', 'height'), 'param': # 4.8.8 ('name', 'value'), 'video': # 4.8.9 ('src','crossorigin','poster','preload','autoplay','playsinline', 'loop','muted','controls','width','height'), 'audio': # 4.8.10 ('src','crossorigin', 'preload', 'autoplay', 'loop', 'muted', 'controls'), 'track': #4.8.11 ('kind', 'src', 'srclang', 'label', 'default'), #### 'body': # 4.3.1 ("onafterprint", "onbeforeprint", "onbeforeunload", "onhashchange", "onlanguagechange", "onmessage", "onmessageerror", "onoffline", "ononline", "onpagehide", "onpageshow", "onpopstate", "onrejectionhandled", "onstorage", "onunhandledrejection", "onunload",), #### 'a': ('href', 'target', 'download', 'ping', 'rel', 'hreflang', 'type', 'referrerpolicy'), ### 'source': ('src', 'type', 'srcset', 'sizes', 'media'), 'script': #4.12.1 ('src', 'type', 'nomodule', 'async', 'defer', 'crossorigin', 'integrity', 'referrerpolicy'), 'li': ('value',) } PARSER = html5lib.HTMLParser(html5lib.treebuilders.getTreeBuilder('dom'), strict=True) class Validator: """ Drills through a html5lib HTML tree, and checks all the elements against various rules. """ def __init__(self, tree): self.tree = tree self._in_doctype = False self._inside = [] # a stack of def __call__(self): """ Actually validate the tree. """ currentNode = self.tree while currentNode is not None: if currentNode.nodeType == Node.DOCUMENT_TYPE_NODE: self.doctype(currentNode.name, currentNode.publicId, currentNode.systemId) elif currentNode.nodeType in (Node.TEXT_NODE, Node.CDATA_SECTION_NODE): self.text(currentNode.nodeValue) elif currentNode.nodeType == Node.ELEMENT_NODE: if hasattr(currentNode, 'tagName'): if currentNode.tagName in void_elements: self.voidTag(currentNode.tagName, currentNode.attributes) else: self.startTag(currentNode.tagName, currentNode.attributes) elif currentNode.nodeType == Node.COMMENT_NODE: self.comment(currentNode) elif currentNode.nodeType in (Node.DOCUMENT_NODE, Node.DOCUMENT_FRAGMENT_NODE): self.document_node(currentNode) else: self.unknown(currentNode) # Go on to the next node, closing this one if needed. # NOTE: we don't actually get given the closing tag from html5lib # here - it's already (theoretically) been parsed. if currentNode.firstChild: currentNode = currentNode.firstChild elif currentNode.nextSibling: currentNode = currentNode.nextSibling else: self.endTag(currentNode.parentNode.tagName) currentNode = currentNode.parentNode.nextSibling or None if currentNode == self.tree: break def check_valid_place(self, name): if name in ('html', 'head', 'body') and not self._inside: return True try: required_parents = html_elements[name] except KeyError: raise InvalidTag(f"{name} is not a valid HTML5 tag.") if not self._inside or self._inside == ['html']: if name in metadata_elements: return True if self._inside == ['html', 'head'] and name == 'body': self._inside.pop() if not any(parent in self._inside for parent in required_parents): raise MisplacedElement(f"{name} must be inside {required_parents}") def check_valid_attrs(self, name, attributes): for (k, v) in attributes.items(): if k in global_attributes: continue if k in element_attributes.get(name, ()): continue if k.startswith('data-'): warnings.warn("data-attributes aren't checked for validity yet") continue # TODO if k in element_attribute_warnings.get(name, ()): warnings.warn(f"{name} should NOT have {k}={v} in HTML5.") continue #if k.startswith('aria-'): # continue # TODO are there other possibilities? # TODO: ng-, vue-, other custom attributes? Should be spec'd by # library users. raise InvalidAttribute(f' {k} is not a valid attribute for {name}') def startTag(self, name, attributes): if name in void_elements: raise InvalidTag(f"{name} cannot be used as a Start Tag") if name in non_recursable and name in self._inside: raise MisplacedElement(f"{name} cannot be inside {name}") self.check_valid_place(name) self.check_valid_attrs(name, attributes) self._inside.append(name) return StartTag(name, attributes) def document_node(self, node): self._in_doctype = True def endTag(self, name): if self._inside[-1] == name: self._inside.pop() else: if self._inside == ['html', 'body'] and name == 'html': return raise MisplacedElement(f"End tag for {name} when not inside.") self.check_valid_place(name) return EndTag(name) def voidTag(self, name, attrs, hasChildren=False): self.check_valid_place(name) self.check_valid_attrs(name, attrs) return VoidTag(name, attrs, hasChildren) def text(self, data): try: prefix, mid, suffix = TEXT_MATCH.match(data).groups() except AttributeError: yield Characters(data) return if prefix: yield SpaceCharacters(prefix) if mid: yield Characters(mid) if suffix: yield SpaceCharacters(suffix) def comment(self, data): return Comment(data) def doctype(self, name, publicId=None, systemId=None): self._in_doctype = True return DocType(name, publicId, systemId) def unknown(self, nodeType): raise Exception(f'Unknown! {nodeType}') def validate(text): """ If text is valid HTML5, return None. Otherwise, raise some kind of Parsing or Linting Exception. """ if not text.strip(): raise EmptyPage() dom = PARSER.parse(text) validator = Validator(dom) validator() if __name__ == '__main__': import sys if len(sys.argv) > 1: for f in sys.argv[1:]: with open(f) as fh: validate(fh.read()) else: validate(sys.stdin.read())