libvirt/scripts/check-html-references.py
Peter Krempa 07467e2719 docs: Prohibit 'external' links within the webpage
Enforce that relative links are used within the page, so that local
installations don't require an internet connection and/or don't redirect
to the web needlessly.

This is done by looking for any local link (barring exceptions) when
checking links with 'check-html-references.py'.

Signed-off-by: Peter Krempa <pkrempa@redhat.com>
Reviewed-by: Ján Tomko <jtomko@redhat.com>
2024-10-09 16:00:44 +02:00
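
For example, with '--project-uri https://libvirt.org' a page containing the
absolute link 'https://libvirt.org/formatdomain.html' (an illustrative
target) is reported as a prohibited external URI, while the relative form
'formatdomain.html' passes; pages whose source path matches one of the
'--project-uri-exceptions' prefixes are exempt.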


#!/usr/bin/env python3
#
# This library is free software; you can redistribute it and/or
# modify it under the terms of the GNU Lesser General Public
# License as published by the Free Software Foundation; either
# version 2.1 of the License, or (at your option) any later version.
#
# This library is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
# Lesser General Public License for more details.
#
# You should have received a copy of the GNU Lesser General Public
# License along with this library. If not, see
# <http://www.gnu.org/licenses/>.
#
# Check that external references between documentation HTML files are not broken.

import argparse
import os
import re
import sys
import xml.etree.ElementTree as ET

ns = {'html': 'http://www.w3.org/1999/xhtml'}

externallinks = []
externalimages = []
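

# collects sorted lists of the '.html' pages (skipping the 404 page) and the
# image files present under the webroot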
def get_file_list(prefix):
    filelist = []
    imagelist = []
    imageformats = ['.jpg', '.svg', '.png']

    for root, dirs, files in os.walk(prefix):
        for file in files:
            ext = os.path.splitext(file)[1]

            if ext == '.html':
                # the 404 page doesn't play well
                if '404.html' in file:
                    continue

                filelist.append(os.path.join(root, file))
            elif ext in imageformats:
                imagelist.append(os.path.join(root, file))

    filelist.sort()
    imagelist.sort()

    return filelist, imagelist

# loads an XHTML and extracts all anchors, local and remote links for the one file
def process_file(filename, project_uri):
    tree = ET.parse(filename)
    root = tree.getroot()
    docname = root.get('data-sourcedoc')
    dirname = os.path.dirname(filename)

    if not docname:
        docname = filename

    anchors = [filename]
    targets = []
    images = []
    projectlinks = []

    for elem in root.findall('.//html:a', ns):
        target = elem.get('href')
        an = elem.get('id')

        if an:
            anchors.append(filename + '#' + an)

        if target:
            if re.search('://', target):
                externallinks.append(target)

                if project_uri is not None and target.startswith(project_uri):
                    projectlinks.append((target, docname))
            elif target[0] != '#' and 'mailto:' not in target:
                targetfull = os.path.normpath(os.path.join(dirname, target))

                targets.append((filename, docname, targetfull, target))

    # older docutils generate "<div class='section'"
    for elem in root.findall('.//html:div/[@class=\'section\']', ns):
        an = elem.get('id')

        if an:
            anchors.append(filename + '#' + an)

    # modern docutils generate a <section element
    for elem in root.findall('.//html:section', ns):
        an = elem.get('id')

        if an:
            anchors.append(filename + '#' + an)

    # find local images
    for elem in root.findall('.//html:img', ns):
        src = elem.get('src')

        if src:
            if re.search('://', src):
                externalimages.append(src)
            else:
                imagefull = os.path.normpath(os.path.join(dirname, src))
                images.append((imagefull, docname))

    return (anchors, targets, images, projectlinks)
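

# runs process_file() on every page and merges the per-file results into
# flat lists covering the whole webroot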
def process_all(filelist, project_uri):
    anchors = []
    targets = []
    images = []
    projectlinks = []

    for file in filelist:
        anchor, target, image, projectlink = process_file(file, project_uri)
        targets = targets + target
        anchors = anchors + anchor
        images = images + image
        projectlinks = projectlinks + projectlink

    return (targets, anchors, images, projectlinks)
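

# reports links whose normalized target (page, or page#anchor) doesn't exist
# among the collected anchors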
def check_targets(targets, anchors):
    errors = []

    for _, docname, target, targetorig in targets:
        if target not in anchors:
            errors.append((docname, targetorig))

    if errors:
        errors.sort()

        for file, target in errors:
            print(f'ERROR: \'{file}\': broken link to: \'{target}\'')

        return True

    return False
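

# recursively visits 'page' and every page reachable from it, recording each
# visited page in 'visited'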
def check_usage_crawl(page, targets, visited):
    visited.append(page)

    tocrawl = []

    for filename, docname, target, _ in targets:
        if page != filename:
            continue

        targetpage = target.split("#", 1)[0]

        if targetpage not in visited and targetpage not in tocrawl:
            tocrawl.append(targetpage)

    for crawl in tocrawl:
        check_usage_crawl(crawl, targets, visited)


# crawls the document references starting from entrypoint and tries to find
# unreachable pages
def check_usage(targets, files, entrypoint):
    visited = []
    fail = False

    check_usage_crawl(entrypoint, targets, visited)

    for file in files:
        if file not in visited:
            brokendoc = file

            for filename, docname, _, _ in targets:
                if filename != file:
                    continue

                if docname:
                    brokendoc = docname
                    break

            print(f'ERROR: \'{brokendoc}\': is not referenced from anywhere')
            fail = True

    return fail


# checks that images present in the directory are being used and also that
# pages link to existing images. For favicons, which are not referenced from
# the '.html' files, there's a built-in set of exceptions.
def check_images(usedimages, imagefiles, ignoreimages):
    favicons = [
        'android-chrome-192x192.png',
        'android-chrome-256x256.png',
        'apple-touch-icon.png',
        'favicon-16x16.png',
        'favicon-32x32.png',
        'mstile-150x150.png',
    ]

    fail = False

    if ignoreimages:
        favicons = favicons + ignoreimages

    for usedimage, docname in usedimages:
        if usedimage not in imagefiles:
            print(f'ERROR: \'{docname}\' references image \'{usedimage}\' not among images')
            fail = True

    for imagefile in imagefiles:
        used = False

        if imagefile in (usedimage[0] for usedimage in usedimages):
            used = True
        else:
            for favicon in favicons:
                if favicon in imagefile:
                    used = True
                    break

        if not used:
            print(f'ERROR: Image \'{imagefile}\' is not used by any page')
            fail = True

    return fail


# checks that all links are accessed via https
def check_https(links):
    fail = False

    for link in links:
        if link.startswith('http://'):
            print(f'ERROR: URI \'{link}\' uses insecure "http" protocol')
            fail = True

    return fail


# checks for prohibited external links that point back into the local project
def check_projectlinks(projectlinks, exceptions):
    fail = False

    for (link, filename) in projectlinks:
        allowed = False

        if exceptions is not None:
            for exc in exceptions:
                if exc in filename:
                    allowed = True
                    break

        if not allowed:
            print(f'ERROR: prohibited external URI \'{link}\' to local project in \'{filename}\'')
            fail = True

    return fail
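

# main program: parse the arguments, collect the files under the webroot and
# run the requested checks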
parser = argparse.ArgumentParser(description='HTML reference checker')
parser.add_argument('--webroot', required=True,
                    help='path to the web root')
parser.add_argument('--entrypoint', default="index.html",
                    help='file name of web entry point relative to --webroot')
parser.add_argument('--external', action="store_true",
                    help='print external references instead')
parser.add_argument('--ignore-images', action='append',
                    help='paths to images that should be considered as used')
parser.add_argument('--require-https', action="store_true",
                    help='require secure https for external links')
parser.add_argument('--project-uri',
                    help='external prefix of the local project (e.g. https://libvirt.org); external links with that prefix are prohibited')
parser.add_argument('--project-uri-exceptions', action='append',
                    help='list of path prefixes excluded from the "--project-uri" checks')

args = parser.parse_args()

files, imagefiles = get_file_list(os.path.abspath(args.webroot))

entrypoint = os.path.join(os.path.abspath(args.webroot), args.entrypoint)

targets, anchors, usedimages, projectlinks = process_all(files, args.project_uri)

fail = False

if args.external:
    # print a deduplicated, sorted list of external references instead of
    # running the checks
    prev = None
    externallinks.sort()

    for ext in externallinks:
        if ext != prev:
            print(f'link: {ext}')

        prev = ext

    prev = None
    externalimages.sort()

    for ext in externalimages:
        if ext != prev:
            print(f'image: {ext}')

        prev = ext
else:
    if check_targets(targets, anchors):
        fail = True

    if check_usage(targets, files, entrypoint):
        fail = True

    if check_images(usedimages, imagefiles, args.ignore_images):
        fail = True

    if check_projectlinks(projectlinks, args.project_uri_exceptions):
        fail = True

    if args.require_https:
        if check_https(externallinks):
            fail = True

        if check_https(externalimages):
            fail = True

if fail:
    sys.exit(1)

sys.exit(0)
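
Typical invocation (illustrative paths): run the script against the built
documentation tree, e.g. 'check-html-references.py --webroot build/docs
--project-uri https://libvirt.org --require-https'. A non-zero exit status
indicates at least one failed check, so the script can gate CI builds.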