#!/usr/bin/env python3
#
# This library is free software; you can redistribute it and/or
# modify it under the terms of the GNU Lesser General Public
# License as published by the Free Software Foundation; either
# version 2.1 of the License, or (at your option) any later version.
#
# This library is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
# Lesser General Public License for more details.
#
# You should have received a copy of the GNU Lesser General Public
# License along with this library. If not, see
# <http://www.gnu.org/licenses/>.
#
# Check that external references between documentation HTML files are not broken.
import sys
import os
import argparse
import re
import xml.etree.ElementTree as ET
# Prefix-to-URI namespace map passed to ElementTree lookups so that the
# 'html:' prefix in a path expression resolves to the XHTML namespace.
ns = {'html': 'http://www.w3.org/1999/xhtml'}
# Link targets containing '://' seen while processing files; process_file()
# appends to this list.
externallinks = []
# NOTE(review): presumably the image counterpart of externallinks; no use of
# it is visible in this part of the file -- confirm against the rest.
externalimages = []
def get_file_list(prefix):
    """Collect the HTML pages and image files found anywhere below *prefix*.

    The directory tree rooted at *prefix* is walked recursively. Files whose
    extension is exactly '.html' are gathered as pages, except those whose
    name contains '404.html'. Files whose extension is '.jpg', '.svg' or
    '.png' are gathered as images. Extension matching is case-sensitive.

    Returns a ``(pages, images)`` tuple of two lists of paths (each path is
    the walked directory joined with the file name), both sorted.
    """
    image_suffixes = ['.jpg', '.svg', '.png']
    pages = []
    pictures = []

    for parent, _subdirs, names in os.walk(prefix):
        for name in names:
            suffix = os.path.splitext(name)[1]
            path = os.path.join(parent, name)

            if suffix in image_suffixes:
                pictures.append(path)
                continue

            if suffix != '.html':
                continue

            # the 404 page doesn't play well
            if '404.html' in name:
                continue

            pages.append(path)

    return sorted(pages), sorted(pictures)
# loads an XHTML file and extracts all anchors, local and remote links for that one file
def process_file(filename):
tree = ET.parse(filename)
root = tree.getroot()
docname = root.get('data-sourcedoc')
dirname = os.path.dirname(filename)
if not docname:
docname = filename
anchors = [filename]
targets = []
images = []
for elem in root.findall('.//html:a', ns):
target = elem.get('href')
an = elem.get('id')
if an:
anchors.append(filename + '#' + an)
if target:
if re.search('://', target):
externallinks.append(target)
elif target[0] != '#' and 'mailto:' not in target:
targetfull = os.path.normpath(os.path.join(dirname, target))
targets.append((filename, docname, targetfull, target))
# older docutils generate "