#!/usr/bin/env python3
#
# This library is free software; you can redistribute it and/or
# modify it under the terms of the GNU Lesser General Public
# License as published by the Free Software Foundation; either
# version 2.1 of the License, or (at your option) any later version.
#
# This library is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
# Lesser General Public License for more details.
#
# You should have received a copy of the GNU Lesser General Public
# License along with this library. If not, see
# <http://www.gnu.org/licenses/>.
#
# Check that external references between documentation HTML files are not broken.

import sys
import os
import argparse
import re
import xml.etree.ElementTree as ET

# XHTML namespace map used by all ElementTree XPath queries below.
ns = {'html': 'http://www.w3.org/1999/xhtml'}

# Accumulates every absolute (scheme://) link seen by process_file();
# reported only when --external is given.
externallinks = []


def get_file_list(prefix):
    """Walk *prefix* and collect HTML files.

    Returns a list of (fullfilename, relfilename) tuples where
    fullfilename is the on-disk path and relfilename is the path
    relative to the parent of *prefix* (used as the canonical link
    identity when matching anchors against targets).
    """
    filelist = []
    # Loop-invariant: hoisted out of the os.walk loop.
    prefixbase = os.path.dirname(prefix)
    for root, _dirs, files in os.walk(prefix):
        if root.startswith(prefixbase):
            relroot = root[len(prefixbase):]
        else:
            relroot = root
        for file in files:
            if not re.search(r'\.html$', file):
                continue
            # the 404 page doesn't play well
            if '404.html' in file:
                continue
            fullfilename = os.path.join(root, file)
            relfilename = os.path.join(relroot, file)
            filelist.append((fullfilename, relfilename))
    return filelist


# loads an XHTML and extracts all anchors, local and remote links for the one file
def process_file(filetuple):
    """Parse one XHTML file and extract link data.

    Returns (anchors, targets):
      anchors -- list of link identities this file provides: the file's
                 relative name plus "relname#id" for every element id.
      targets -- list of (targetname, filename, target) tuples for each
                 local link leaving this file; targetname is normalized
                 relative to the documentation root.

    Side effect: absolute links (containing "://") are appended to the
    module-level ``externallinks`` list.
    """
    filename, relfilename = filetuple
    tree = ET.parse(filename)
    root = tree.getroot()

    # A file is always a valid target by its own relative name.
    anchors = [relfilename]
    targets = []

    for elem in root.findall('.//html:a', ns):
        target = elem.get('href')
        an = elem.get('id')
        if an:
            anchors.append(relfilename + '#' + an)
        if target:
            if '://' in target:
                externallinks.append(target)
            elif target[0] != '#' and 'mailto:' not in target:
                # Resolve the link relative to this file's directory so it
                # can be compared against other files' relative names.
                dirname = os.path.dirname(relfilename)
                targetname = os.path.normpath(os.path.join(dirname, target))
                targets.append((targetname, filename, target))

    # older docutils generate "<div class='section'"
    for elem in root.findall('.//html:div/[@class=\'section\']', ns):
        an = elem.get('id')
        if an:
            anchors.append(relfilename + '#' + an)

    # modern docutils generate a <section element
    for elem in root.findall('.//html:section', ns):
        an = elem.get('id')
        if an:
            anchors.append(relfilename + '#' + an)

    return (anchors, targets)


def process_all(filelist):
    """Process every file tuple in *filelist*.

    Returns (targets, anchors) aggregated across all files — note the
    order is swapped relative to process_file()'s return value.
    """
    anchors = []
    targets = []
    for filetuple in filelist:
        anchor, target = process_file(filetuple)
        # extend() instead of list concatenation: avoids rebuilding the
        # accumulator lists on every iteration (quadratic behavior).
        targets.extend(target)
        anchors.extend(anchor)
    return (targets, anchors)


def check_targets(targets, anchors):
    """Report link targets that do not resolve to any known anchor.

    Prints one line per broken link (sorted) and returns True when at
    least one broken link was found, False otherwise.
    """
    # Set membership is O(1); the anchors list can be large.
    anchorset = set(anchors)
    errors = []
    for target, targetfrom, targetorig in targets:
        if target not in anchorset:
            errors.append((targetfrom, targetorig))

    if errors:
        errors.sort()
        print('broken link targets:')
        for file, target in errors:
            print(file + " broken link: " + target)
        return True

    return False


def main():
    """Command-line entry point: exit 1 on broken links, 0 otherwise."""
    parser = argparse.ArgumentParser(description='HTML reference checker')
    parser.add_argument('--prefix', default='.',
                        help='build tree prefix')
    parser.add_argument('--external', action="store_true",
                        help='print external references instead')
    args = parser.parse_args()

    files = get_file_list(args.prefix)
    targets, anchors = process_all(files)

    if args.external:
        # Print each distinct external link once, in sorted order.
        for ext in sorted(set(externallinks)):
            print(ext)
    else:
        if check_targets(targets, anchors):
            sys.exit(1)

    sys.exit(0)


# Guarded so the module can be imported (e.g. for testing) without
# argparse consuming the importer's argv.
if __name__ == "__main__":
    main()