#!/usr/bin/env python3 # # This library is free software; you can redistribute it and/or # modify it under the terms of the GNU Lesser General Public # License as published by the Free Software Foundation; either # version 2.1 of the License, or (at your option) any later version. # # This library is distributed in the hope that it will be useful, # but WITHOUT ANY WARRANTY; without even the implied warranty of # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU # Lesser General Public License for more details. # # You should have received a copy of the GNU Lesser General Public # License along with this library. If not, see # . # # Check that external references between documentation HTML files are not broken. import sys import os import argparse import re import xml.etree.ElementTree as ET ns = {'html': 'http://www.w3.org/1999/xhtml'} externallinks = [] externalimages = [] def get_file_list(prefix): filelist = [] imagelist = [] imageformats = ['.jpg', '.svg', '.png'] for root, dir, files in os.walk(prefix): for file in files: ext = os.path.splitext(file)[1] if ext == '.html': # the 404 page doesn't play well if '404.html' in file: continue filelist.append(os.path.join(root, file)) elif ext in imageformats: imagelist.append(os.path.join(root, file)) filelist.sort() imagelist.sort() return filelist, imagelist # loads an XHTML and extracts all anchors, local and remote links for the one file def process_file(filename): tree = ET.parse(filename) root = tree.getroot() docname = root.get('data-sourcedoc') dirname = os.path.dirname(filename) if not docname: docname = filename anchors = [filename] targets = [] images = [] for elem in root.findall('.//html:a', ns): target = elem.get('href') an = elem.get('id') if an: anchors.append(filename + '#' + an) if target: if re.search('://', target): externallinks.append(target) elif target[0] != '#' and 'mailto:' not in target: targetfull = os.path.normpath(os.path.join(dirname, target)) targets.append((filename, docname, targetfull, target)) # older docutils generate "