#!/usr/bin/env python3
#
# This library is free software; you can redistribute it and/or
# modify it under the terms of the GNU Lesser General Public
# License as published by the Free Software Foundation; either
# version 2.1 of the License, or (at your option) any later version.
#
# This library is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
# Lesser General Public License for more details.
#
# You should have received a copy of the GNU Lesser General Public
# License along with this library.  If not, see
# <http://www.gnu.org/licenses/>.
#
# Check that external references between documentation HTML files are not broken.

import sys
import os
import argparse
import re
import xml.etree.ElementTree as ET

ns = {'html': 'http://www.w3.org/1999/xhtml'}
externallinks = []


def get_file_list(prefix):
    """Collect the HTML files found under the directory tree at 'prefix'.

    Returns a list of (fullfilename, relfilename) tuples, one per file
    whose name ends in '.html'. 'fullfilename' is the path as produced by
    walking 'prefix'; 'relfilename' is the same path with the parent
    directory of 'prefix' (os.path.dirname(prefix)) stripped from the
    front. Files whose name contains '404.html' are skipped.
    """
    filelist = []

    # The parent directory of 'prefix' does not change while walking, so
    # work it out once instead of once per visited directory.
    prefixbase = os.path.dirname(prefix)

    for root, _dirs, files in os.walk(prefix):
        if root.startswith(prefixbase):
            relroot = root[len(prefixbase):]
        else:
            relroot = root

        for name in files:
            if not name.endswith('.html'):
                continue

            # the 404 page doesn't play well
            if '404.html' in name:
                continue

            fullfilename = os.path.join(root, name)
            relfilename = os.path.join(relroot, name)
            filelist.append((fullfilename, relfilename))

    return filelist


# loads an XHTML and extracts all anchors, local and remote links for the one file
def process_file(filetuple):
    filename, relfilename = filetuple
    tree = ET.parse(filename)
    root = tree.getroot()

    anchors = [relfilename]
    targets = []

    for elem in root.findall('.//html:a', ns):
        target = elem.get('href')
        an = elem.get('id')

        if an:
            anchors.append(relfilename + '#' + an)

        if target:
            if re.search('://', target):
                externallinks.append(target)
            elif target[0] != '#' and 'mailto:' not in target:
                dirname = os.path.dirname(relfilename)
                targetname = os.path.normpath(os.path.join(dirname, target))

                targets.append((targetname, filename, target))

    #
older docutils generate "