docs: Add HTML reference checker

In many cases we move around or rename internal anchors which may break
links leading to the content.

docutils handles the case of links inside a document, but we are lacking
the same form of checking between documents.

Introduce a script which cross-checks all the anchors and links in the HTML
output files and prints any problems, and use it as a test case for the
'docs' directory.

Signed-off-by: Peter Krempa <pkrempa@redhat.com>
Reviewed-by: Ján Tomko <jtomko@redhat.com>
This commit is contained in:
Peter Krempa 2022-05-31 15:15:57 +02:00
parent dcff02d797
commit 8452124669
3 changed files with 165 additions and 0 deletions

View File

@ -350,3 +350,14 @@ run_target(
], ],
depends: install_web_deps, depends: install_web_deps,
) )
# Run the HTML reference checker script as a test case, pointing its
# '--prefix' argument at the 'docs' directory of the build tree.
test(
'check-html-references',
python3_prog,
args: [
check_html_references_prog.path(),
'--prefix',
meson.build_root() / 'docs'
],
env: runutf8,
)

153
scripts/check-html-references.py Executable file
View File

@ -0,0 +1,153 @@
#!/usr/bin/env python3
#
# This library is free software; you can redistribute it and/or
# modify it under the terms of the GNU Lesser General Public
# License as published by the Free Software Foundation; either
# version 2.1 of the License, or (at your option) any later version.
#
# This library is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
# Lesser General Public License for more details.
#
# You should have received a copy of the GNU Lesser General Public
# License along with this library. If not, see
# <http://www.gnu.org/licenses/>.
#
# Check that external references between documentation HTML files are not broken.
import sys
import os
import argparse
import re
import xml.etree.ElementTree as ET
# namespace map passed to ElementTree queries so that the 'html:' prefix
# resolves to the XHTML namespace
ns = {'html': 'http://www.w3.org/1999/xhtml'}
# links containing '://' found while processing files; filled in by
# process_file() and printed when '--external' is requested
externallinks = []
def get_file_list(prefix):
    """Collect the HTML files found in the directory tree below *prefix*.

    Returns a list of ``(fullfilename, relfilename)`` tuples, one per file
    whose name ends in '.html', skipping any file with '404.html' in its
    name. ``fullfilename`` is the walked directory joined with the file
    name. ``relfilename`` is the same path with the parent directory of
    *prefix* stripped from the front when the walked directory starts
    with it.
    """
    # the parent directory of the prefix is the same for every walked
    # directory, so compute it once rather than per iteration
    prefixbase = os.path.dirname(prefix)
    filelist = []

    for root, _dirs, files in os.walk(prefix):
        if root.startswith(prefixbase):
            relroot = root[len(prefixbase):]
        else:
            relroot = root

        for name in files:
            if not name.endswith('.html'):
                continue

            # the 404 page doesn't play well
            if '404.html' in name:
                continue

            filelist.append((os.path.join(root, name),
                             os.path.join(relroot, name)))

    return filelist
def process_file(filetuple):
    """Load one XHTML file and extract its anchors and local link targets.

    *filetuple* is a ``(filename, relfilename)`` pair; ``filename`` is
    parsed, ``relfilename`` is used to name the anchors.

    Returns ``(anchors, targets)``:

    * ``anchors`` starts with ``relfilename`` itself, followed by
      ``relfilename + '#' + id`` for every <a>, <div class='section'> and
      <section> element carrying an 'id' attribute.
    * ``targets`` holds ``(normalized target, filename, original href)``
      for every link that is neither external, nor a same-document '#'
      reference, nor a 'mailto:' link.

    Links containing '://' are appended to the module-level
    ``externallinks`` list instead.
    """
    fullpath, relpath = filetuple
    document = ET.parse(fullpath).getroot()
    basedir = os.path.dirname(relpath)

    anchors = [relpath]
    targets = []

    for link in document.findall('.//html:a', ns):
        href = link.get('href')
        ident = link.get('id')

        if ident:
            anchors.append(relpath + '#' + ident)

        if not href:
            continue

        if re.search('://', href):
            externallinks.append(href)
            continue

        # same-document references and e-mail links are not checked
        if href[0] == '#' or 'mailto:' in href:
            continue

        # resolve the link relative to the directory of the linking file
        resolved = os.path.normpath(os.path.join(basedir, href))
        targets.append((resolved, fullpath, href))

    section_queries = (
        # older docutils generate "<div class='section'"
        './/html:div/[@class=\'section\']',
        # modern docutils generate a <section element
        './/html:section',
    )

    for query in section_queries:
        for node in document.findall(query, ns):
            ident = node.get('id')
            if ident:
                anchors.append(relpath + '#' + ident)

    return (anchors, targets)
def process_all(filelist):
    """Run process_file() on every entry of *filelist* and merge the results.

    *filelist* is an iterable of the tuples accepted by process_file().

    Returns ``(targets, anchors)``. Note that the order is the reverse of
    the ``(anchors, targets)`` pair returned by process_file().
    """
    anchors = []
    targets = []

    for filetuple in filelist:
        fileanchors, filetargets = process_file(filetuple)
        # extend in place; concatenating with '+' would copy the whole
        # accumulated list again for every file
        targets.extend(filetargets)
        anchors.extend(fileanchors)

    return (targets, anchors)
def check_targets(targets, anchors):
    """Report link targets which do not match any known anchor.

    *targets* is an iterable of ``(target, targetfrom, targetorig)`` tuples
    and *anchors* an iterable of anchor names. A target is broken when its
    first element is not among *anchors*.

    Broken links are printed sorted by ``(targetfrom, targetorig)``.

    Returns True if at least one broken link was found, False otherwise.
    """
    # membership is tested once per link, so build a set up front instead
    # of scanning the anchor list linearly every time
    known = set(anchors)

    errors = sorted((targetfrom, targetorig)
                    for target, targetfrom, targetorig in targets
                    if target not in known)

    if not errors:
        return False

    print('broken link targets:')

    for file, target in errors:
        print(file + " broken link: " + target)

    return True
# Command line handling: '--prefix' selects the tree to scan, '--external'
# switches from checking local links to listing external ones.
parser = argparse.ArgumentParser(description='HTML reference checker')
parser.add_argument('--prefix', default='.', help='build tree prefix')
parser.add_argument('--external', action="store_true",
                    help='print external references instead')
args = parser.parse_args()

# Processing the files also fills in 'externallinks' as a side effect.
files = get_file_list(args.prefix)
targets, anchors = process_all(files)

if args.external:
    # print each external link once, in sorted order
    for ext in sorted(set(externallinks)):
        print(ext)
elif check_targets(targets, anchors):
    # broken local links were reported
    sys.exit(1)

sys.exit(0)

View File

@ -6,6 +6,7 @@ scripts = [
'check-driverimpls.py', 'check-driverimpls.py',
'check-drivername.py', 'check-drivername.py',
'check-file-access.py', 'check-file-access.py',
'check-html-references.py',
'check-remote-protocol.py', 'check-remote-protocol.py',
'check-symfile.py', 'check-symfile.py',
'check-symsorting.py', 'check-symsorting.py',