#!/usr/bin/env python3
#
# This library is free software; you can redistribute it and/or
# modify it under the terms of the GNU Lesser General Public
# License as published by the Free Software Foundation; either
# version 2.1 of the License, or (at your option) any later version.
#
# This library is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
# Lesser General Public License for more details.
#
# You should have received a copy of the GNU Lesser General Public
# License along with this library. If not, see
# <http://www.gnu.org/licenses/>.
#
# Check that external references between documentation HTML files are not broken.

import argparse
import os
import re
import sys
import xml.etree.ElementTree as ET

ns = {'html': 'http://www.w3.org/1999/xhtml'}
externallinks = []
externalimages = []


# walks the web root and collects all HTML pages and image files
def get_file_list(prefix):
    filelist = []
    imagelist = []
    imageformats = ['.jpg', '.svg', '.png']

    for root, dirs, files in os.walk(prefix):
        for file in files:
            ext = os.path.splitext(file)[1]

            if ext == '.html':
                # the 404 page doesn't play well
                if '404.html' in file:
                    continue

                filelist.append(os.path.join(root, file))
            elif ext in imageformats:
                imagelist.append(os.path.join(root, file))

    filelist.sort()
    imagelist.sort()

    return filelist, imagelist


# loads an XHTML file and extracts all anchors, local and remote links for the one file
def process_file(filename, project_uri):
    tree = ET.parse(filename)
    root = tree.getroot()
    docname = root.get('data-sourcedoc')
    dirname = os.path.dirname(filename)

    if not docname:
        docname = filename

    anchors = [filename]
    targets = []
    images = []
    projectlinks = []

    for elem in root.findall('.//html:a', ns):
        target = elem.get('href')
        an = elem.get('id')

        if an:
            anchors.append(filename + '#' + an)

        if target:
            if re.search('://', target):
                externallinks.append(target)

                if project_uri is not None and target.startswith(project_uri):
                    projectlinks.append((target, docname))
            elif target[0] != '#' and 'mailto:' not in target:
                targetfull = os.path.normpath(os.path.join(dirname, target))

                targets.append((filename, docname, targetfull, target))

    # older docutils generate "<div class='section'"
    for elem in root.findall('.//html:div/[@class=\'section\']', ns):
        an = elem.get('id')

        if an:
            anchors.append(filename + '#' + an)

    # modern docutils generate a <section element
    for elem in root.findall('.//html:section', ns):
        an = elem.get('id')

        if an:
            anchors.append(filename + '#' + an)

    # find local images
    for elem in root.findall('.//html:img', ns):
        src = elem.get('src')

        if src:
            if re.search('://', src):
                externalimages.append(src)
            else:
                imagefull = os.path.normpath(os.path.join(dirname, src))
                images.append((imagefull, docname))

    return (anchors, targets, images, projectlinks)


# aggregates anchors, link targets, images and project links from all files
def process_all(filelist, project_uri):
    anchors = []
    targets = []
    images = []
    projectlinks = []

    for file in filelist:
        anchor, target, image, projectlink = process_file(file, project_uri)
        targets = targets + target
        anchors = anchors + anchor
        images = images + image
        projectlinks = projectlinks + projectlink

    return (targets, anchors, images, projectlinks)


# checks that every local link target resolves to an existing file or anchor
def check_targets(targets, anchors):
    errors = []
    for _, docname, target, targetorig in targets:
        if target not in anchors:
            errors.append((docname, targetorig))

    if errors:
        errors.sort()

        for file, target in errors:
            print(f'ERROR: \'{file}\': broken link to: \'{target}\'')

        return True

    return False


def check_usage_crawl(page, targets, visited):
    visited.append(page)

    tocrawl = []

    for filename, docname, target, _ in targets:
        if page != filename:
            continue

        targetpage = target.split("#", 1)[0]

        if targetpage not in visited and targetpage not in tocrawl:
            tocrawl.append(targetpage)

    for crawl in tocrawl:
        check_usage_crawl(crawl, targets, visited)


# crawls the document references starting from entrypoint and tries to find
# unreachable pages
def check_usage(targets, files, entrypoint):
    visited = []
    fail = False

    check_usage_crawl(entrypoint, targets, visited)

    for file in files:
        if file not in visited:
            brokendoc = file

            for filename, docname, _, _ in targets:
                if filename != file:
                    continue

                if docname:
                    brokendoc = docname
                    break

            print(f'ERROR: \'{brokendoc}\': is not referenced from anywhere')
            fail = True

    return fail


# checks that images present in the directory are being used and also that
# pages link to existing images. For favicons, which are not referenced from
# the '.html' files, there's a builtin set of exceptions.
def check_images(usedimages, imagefiles, ignoreimages):
    favicons = [
        'android-chrome-192x192.png',
        'android-chrome-256x256.png',
        'apple-touch-icon.png',
        'favicon-16x16.png',
        'favicon-32x32.png',
        'mstile-150x150.png',
    ]

    fail = False

    if ignoreimages:
        favicons = favicons + ignoreimages

    for usedimage, docname in usedimages:
        if usedimage not in imagefiles:
            print(f'ERROR: \'{docname}\' references image \'{usedimage}\' not among images')
            fail = True

    for imagefile in imagefiles:
        used = False

        if imagefile in (usedimage[0] for usedimage in usedimages):
            used = True
        else:
            for favicon in favicons:
                if favicon in imagefile:
                    used = True
                    break

        if not used:
            print(f'ERROR: Image \'{imagefile}\' is not used by any page')
            fail = True

    return fail


# checks that all external links are accessed via https
def check_https(links):
    fail = False

    for link in links:
        if link.startswith('http://'):
            print(f'ERROR: URI \'{link}\' uses insecure "http" protocol')
            fail = True

    return fail


# checks for prohibited external links pointing back to the local project
def check_projectlinks(projectlinks, exceptions):
    fail = False

    for (link, filename) in projectlinks:
        allowed = False

        if exceptions is not None:
            for exc in exceptions:
                if exc in filename:
                    allowed = True
                    break

        if not allowed:
            print(f'ERROR: prohibited external URI \'{link}\' to local project in \'{filename}\'')
            fail = True

    return fail

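
# Example invocation (a sketch only; the web root path below is illustrative
# and not part of this script, and any of the optional flags may be omitted):
#
#   ./check-html-references.py --webroot build/docs/html \
#       --require-https \
#       --project-uri https://libvirt.org
#
# The script exits with status 1 if any of the checks below reports an error.
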
parser = argparse.ArgumentParser(description='HTML reference checker')
parser.add_argument('--webroot', required=True,
                    help='path to the web root')
parser.add_argument('--entrypoint', default="index.html",
                    help='file name of web entry point relative to --webroot')
parser.add_argument('--external', action="store_true",
                    help='print external references instead')
parser.add_argument('--ignore-images', action='append',
                    help='paths to images that should be considered as used')
parser.add_argument('--require-https', action="store_true",
                    help='require secure https for external links')
parser.add_argument('--project-uri',
                    help='external prefix of the local project (e.g. https://libvirt.org); '
                         'external links with that prefix are prohibited')
parser.add_argument('--project-uri-exceptions', action='append',
                    help='list of path prefixes excluded from the "--project-uri" checks')

args = parser.parse_args()

files, imagefiles = get_file_list(os.path.abspath(args.webroot))

entrypoint = os.path.join(os.path.abspath(args.webroot), args.entrypoint)

targets, anchors, usedimages, projectlinks = process_all(files, args.project_uri)

fail = False

if args.external:
    # print a de-duplicated, sorted list of external references instead of checking
    prev = None
    externallinks.sort()
    for ext in externallinks:
        if ext != prev:
            print(f'link: {ext}')

        prev = ext

    prev = None
    externalimages.sort()
    for ext in externalimages:
        if ext != prev:
            print(f'image: {ext}')

        prev = ext
else:
    if check_targets(targets, anchors):
        fail = True

    if check_usage(targets, files, entrypoint):
        fail = True

    if check_images(usedimages, imagefiles, args.ignore_images):
        fail = True

    if check_projectlinks(projectlinks, args.project_uri_exceptions):
        fail = True

    if args.require_https:
        if check_https(externallinks):
            fail = True

        if check_https(externalimages):
            fail = True

if fail:
    sys.exit(1)

sys.exit(0)