diff --git a/docs/meson.build b/docs/meson.build index d71f6006dd..cb70ef6084 100644 --- a/docs/meson.build +++ b/docs/meson.build @@ -350,3 +350,14 @@ run_target( ], depends: install_web_deps, ) + +test( + 'check-html-references', + python3_prog, + args: [ + check_html_references_prog.path(), + '--prefix', + meson.build_root() / 'docs' + ], + env: runutf8, +) diff --git a/scripts/check-html-references.py b/scripts/check-html-references.py new file mode 100755 index 0000000000..95a61a6bb4 --- /dev/null +++ b/scripts/check-html-references.py @@ -0,0 +1,153 @@ +#!/usr/bin/env python3 +# +# This library is free software; you can redistribute it and/or +# modify it under the terms of the GNU Lesser General Public +# License as published by the Free Software Foundation; either +# version 2.1 of the License, or (at your option) any later version. +# +# This library is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU +# Lesser General Public License for more details. +# +# You should have received a copy of the GNU Lesser General Public +# License along with this library. If not, see +# <http://www.gnu.org/licenses/>. +# +# Check that external references between documentation HTML files are not broken. 
import sys
import os
import argparse
import re
import xml.etree.ElementTree as ET

ns = {'html': 'http://www.w3.org/1999/xhtml'}
externallinks = []


def get_file_list(prefix):
    """Collect the HTML files found below 'prefix'.

    Walks 'prefix' recursively and returns a list of
    (fullfilename, relfilename) tuples, one per file whose name ends in
    '.html'. 'fullfilename' is the path as produced by the walk;
    'relfilename' is the same path with the parent directory of 'prefix'
    cut off the front, so it still starts with the last component of
    'prefix'. Files whose name contains '404.html' are left out.

    Directories and files are visited in sorted order so that the result
    (and anything reported from it) is the same on every run.
    """
    # Invariant for the whole walk, so compute it once up front.
    prefixbase = os.path.dirname(prefix)
    filelist = []

    for root, dirs, files in os.walk(prefix):
        # Sorting 'dirs' in place steers the order in which os.walk()
        # descends; directory listing order is otherwise arbitrary.
        dirs.sort()

        if root.startswith(prefixbase):
            relroot = root[len(prefixbase):]
        else:
            relroot = root

        for name in sorted(files):
            if not name.endswith('.html'):
                continue

            # the 404 page doesn't play well
            if '404.html' in name:
                continue

            fullfilename = os.path.join(root, name)
            relfilename = os.path.join(relroot, name)
            filelist.append((fullfilename, relfilename))

    return filelist


# loads an XHTML and extracts all anchors, local and remote links for the one file
def process_file(filetuple):
    filename, relfilename = filetuple
    tree = ET.parse(filename)
    root = tree.getroot()

    anchors = [relfilename]
    targets = []

    for elem in root.findall('.//html:a', ns):
        target = elem.get('href')
        an = elem.get('id')

        if an:
            anchors.append(relfilename + '#' + an)

        if target:
            if re.search('://', target):
                externallinks.append(target)
            elif target[0] != '#' and 'mailto:' not in target:
                dirname = os.path.dirname(relfilename)
                targetname = os.path.normpath(os.path.join(dirname, target))

                targets.append((targetname, filename, target))

    # older docutils generate "