2022-05-31 15:15:57 +02:00
|
|
|
#!/usr/bin/env python3
|
|
|
|
#
|
|
|
|
# This library is free software; you can redistribute it and/or
|
|
|
|
# modify it under the terms of the GNU Lesser General Public
|
|
|
|
# License as published by the Free Software Foundation; either
|
|
|
|
# version 2.1 of the License, or (at your option) any later version.
|
|
|
|
#
|
|
|
|
# This library is distributed in the hope that it will be useful,
|
|
|
|
# but WITHOUT ANY WARRANTY; without even the implied warranty of
|
|
|
|
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
|
|
|
|
# Lesser General Public License for more details.
|
|
|
|
#
|
|
|
|
# You should have received a copy of the GNU Lesser General Public
|
|
|
|
# License along with this library. If not, see
|
|
|
|
# <http://www.gnu.org/licenses/>.
|
|
|
|
#
|
|
|
|
# Check that external references between documentation HTML files are not broken.
|
|
|
|
|
|
|
|
import argparse
|
2023-06-05 15:40:13 +08:00
|
|
|
import os
|
2022-05-31 15:15:57 +02:00
|
|
|
import re
|
2023-06-05 15:40:13 +08:00
|
|
|
import sys
|
2022-05-31 15:15:57 +02:00
|
|
|
import xml.etree.ElementTree as ET
|
|
|
|
|
|
|
|
# Namespace map that lets ElementTree queries use the 'html:' prefix for
# XHTML elements.
ns = dict(html='http://www.w3.org/1999/xhtml')

# Module-wide accumulators of absolute ('scheme://') link and image URIs;
# process_file() appends to them for every page it scans.
externallinks, externalimages = [], []
|
2022-05-31 15:15:57 +02:00
|
|
|
|
|
|
|
|
|
|
|
def get_file_list(prefix):
    """Collect the HTML pages and image files found below 'prefix'.

    The directory tree is walked recursively.  Files ending in '.html' are
    reported as pages, except those whose name contains '404.html'; files
    ending in '.jpg', '.svg' or '.png' are reported as images.

    Returns a tuple (pages, images) of two sorted lists of paths, each path
    being the walked directory joined with the file name.
    """
    image_suffixes = ['.jpg', '.svg', '.png']
    pages = []
    pictures = []

    for dirpath, _subdirs, names in os.walk(prefix):
        for name in names:
            suffix = os.path.splitext(name)[1]
            path = os.path.join(dirpath, name)

            if suffix in image_suffixes:
                pictures.append(path)
            # the 404 page doesn't play well
            elif suffix == '.html' and '404.html' not in name:
                pages.append(path)

    return sorted(pages), sorted(pictures)
|
2022-05-31 15:15:57 +02:00
|
|
|
|
|
|
|
|
|
|
|
# loads an XHTML and extracts all anchors, local and remote links for the one file
|
2024-10-08 15:06:17 +02:00
|
|
|
def process_file(filename, project_uri):
    """Parse one XHTML page and extract its anchors, links and images.

    filename:    path of the XHTML file to parse
    project_uri: optional URI prefix; absolute links starting with it are
                 additionally recorded as project links (None disables this)

    Returns a tuple (anchors, targets, images, projectlinks):
      anchors      - 'filename' itself plus 'filename#id' for every <a>,
                     <div class='section'> and <section> carrying an 'id'
      targets      - (filename, docname, normalized target path, raw href)
                     for every relative link which is neither a pure
                     '#fragment' nor contains 'mailto:'
      images       - (normalized image path, docname) for relative <img>
      projectlinks - (href, docname) for absolute links under 'project_uri'

    Absolute links and image sources (those containing '://') are also
    appended to the module-level 'externallinks' / 'externalimages' lists.
    """
    page = ET.parse(filename).getroot()
    basedir = os.path.dirname(filename)

    # report problems against the source document when the page names one
    docname = page.get('data-sourcedoc') or filename

    anchors = [filename]
    targets = []
    images = []
    projectlinks = []

    for link in page.findall('.//html:a', ns):
        anchor_id = link.get('id')
        if anchor_id:
            anchors.append(filename + '#' + anchor_id)

        href = link.get('href')
        if not href:
            continue

        if '://' in href:
            externallinks.append(href)

            if project_uri is not None and href.startswith(project_uri):
                projectlinks.append((href, docname))

        elif not href.startswith('#') and 'mailto:' not in href:
            # resolve the link relative to the directory of this page
            resolved = os.path.normpath(os.path.join(basedir, href))
            targets.append((filename, docname, resolved, href))

    # older docutils generate "<div class='section'", modern docutils
    # generate a <section element; both can serve as link anchors
    for query in ('.//html:div/[@class=\'section\']', './/html:section'):
        for section in page.findall(query, ns):
            section_id = section.get('id')
            if section_id:
                anchors.append(filename + '#' + section_id)

    # find local images
    for img in page.findall('.//html:img', ns):
        src = img.get('src')
        if not src:
            continue

        if '://' in src:
            externalimages.append(src)
        else:
            resolved = os.path.normpath(os.path.join(basedir, src))
            images.append((resolved, docname))

    return (anchors, targets, images, projectlinks)
|
2022-05-31 15:15:57 +02:00
|
|
|
|
|
|
|
|
2024-10-08 15:06:17 +02:00
|
|
|
def process_all(filelist, project_uri):
    """Run process_file() on every file and merge the per-file results.

    filelist:    iterable of XHTML file paths
    project_uri: passed through unchanged to process_file()

    Returns a tuple (targets, anchors, images, projectlinks) of lists.
    Note that 'targets' comes first here, whereas process_file() returns
    the anchors first.
    """
    anchors = []
    targets = []
    images = []
    projectlinks = []

    for file in filelist:
        anchor, target, image, projectlink = process_file(file, project_uri)

        # extend in place; rebuilding the lists with '+' would copy all
        # previously gathered entries again for every processed file
        targets.extend(target)
        anchors.extend(anchor)
        images.extend(image)
        projectlinks.extend(projectlink)

    return (targets, anchors, images, projectlinks)
|
2022-05-31 15:15:57 +02:00
|
|
|
|
|
|
|
|
|
|
|
def check_targets(targets, anchors):
    """Report links whose target is not among the known anchors.

    targets: iterable of (filename, docname, target, targetorig) tuples;
             'target' is looked up in 'anchors', 'docname' and 'targetorig'
             are used for the error message
    anchors: iterable of valid link destinations

    Prints one 'ERROR' line per broken link, sorted by document name and
    original target.  Returns True if any broken link was found, False
    otherwise.
    """
    # set lookup instead of scanning the whole anchor list for every link
    known = set(anchors)

    errors = sorted((docname, targetorig)
                    for _, docname, target, targetorig in targets
                    if target not in known)

    for file, target in errors:
        print(f'ERROR: \'{file}\': broken link to: \'{target}\'')

    return bool(errors)
|
|
|
|
|
|
|
|
|
2023-02-14 13:14:25 +01:00
|
|
|
def check_usage_crawl(page, targets, visited):
    """Record every page reachable from 'page' by following local links.

    page:    file name of the page to start crawling from
    targets: iterable of (filename, docname, target, targetorig) tuples; a
             tuple means page 'filename' links to 'target', whose part
             before the first '#' is the linked page
    visited: list which is extended in place with each newly reached page
             (including 'page' itself); pages already present in it are
             not added again

    The crawl is iterative, so long chains of pages cannot exhaust the
    interpreter recursion limit, and every page is expanded only once.
    Nothing is returned; the result is the updated 'visited' list.
    """
    # index the outgoing links once instead of rescanning all targets for
    # every crawled page
    outgoing = {}
    for filename, _, target, _ in targets:
        outgoing.setdefault(filename, []).append(target.split("#", 1)[0])

    seen = set(visited)

    if page not in seen:
        seen.add(page)
        visited.append(page)

    pending = [page]

    while pending:
        current = pending.pop()

        for targetpage in outgoing.get(current, ()):
            if targetpage in seen:
                continue

            # mark pages when they are queued so that no page is ever
            # queued (and thus crawled) twice
            seen.add(targetpage)
            visited.append(targetpage)
            pending.append(targetpage)
|
|
|
|
|
|
|
|
|
|
|
|
# crawls the document references starting from entrypoint and tries to find
|
|
|
|
# unreachable pages
|
|
|
|
def check_usage(targets, files, entrypoint):
    """Report pages which cannot be reached from the entry point.

    targets:    iterable of (filename, docname, target, targetorig) link
                tuples describing the links between pages
    files:      iterable of page file names that are expected to be reachable
    entrypoint: file name of the page where crawling starts

    Prints one 'ERROR' line for every file that the crawl did not reach,
    naming it by the document name of its first link entry that has one, or
    by the file name otherwise.  Returns True if any such file exists.
    """
    reached = []
    check_usage_crawl(entrypoint, targets, reached)

    orphans = [candidate for candidate in files if candidate not in reached]

    for orphan in orphans:
        # prefer the source document name recorded with the page's links
        brokendoc = next((docname
                          for filename, docname, _, _ in targets
                          if filename == orphan and docname),
                         orphan)

        print(f'ERROR: \'{brokendoc}\': is not referenced from anywhere')

    return bool(orphans)
|
|
|
|
|
|
|
|
|
2023-02-14 14:38:40 +01:00
|
|
|
# checks that images present in the directory are being used and also that
|
|
|
|
# pages link to existing images. For favicons, which are not referenced from
|
|
|
|
# the '.html' files there's a builtin set of exceptions.
|
|
|
|
def check_images(usedimages, imagefiles, ignoreimages):
    """Cross-check the images referenced by pages with the image files.

    usedimages:   iterable of (image path, docname) tuples, one per image
                  reference found in a page
    imagefiles:   iterable of image file paths which exist
    ignoreimages: optional list of strings; an unreferenced image file
                  whose path contains one of them counts as used (may be
                  None or empty)

    Prints an 'ERROR' line for every reference to an image that is not in
    'imagefiles' and for every image file that is neither referenced nor
    matched by the built-in favicon names or 'ignoreimages'.  Returns True
    if any error was printed, False otherwise.
    """
    # favicons are not referenced from the '.html' files
    favicons = [
        'android-chrome-192x192.png',
        'android-chrome-256x256.png',
        'apple-touch-icon.png',
        'favicon-16x16.png',
        'favicon-32x32.png',
        'mstile-150x150.png',
    ]
    fail = False

    if ignoreimages:
        favicons = favicons + list(ignoreimages)

    # build the lookup sets once rather than rescanning the full lists for
    # every single image
    present = set(imagefiles)
    referenced = {usedimage for usedimage, _ in usedimages}

    for usedimage, docname in usedimages:
        if usedimage not in present:
            print(f'ERROR: \'{docname}\' references image \'{usedimage}\' not among images')
            fail = True

    for imagefile in imagefiles:
        if imagefile in referenced:
            continue

        # substring match, so the exceptions apply in any directory
        if any(favicon in imagefile for favicon in favicons):
            continue

        print(f'ERROR: Image \'{imagefile}\' is not used by any page')
        fail = True

    return fail
|
|
|
|
|
|
|
|
|
2024-10-08 13:38:34 +02:00
|
|
|
# checks that all links are accessed via https
|
|
|
|
def check_https(links):
    """Report URIs which use the plain 'http://' scheme.

    links: iterable of URI strings

    Prints one 'ERROR' line per URI starting with 'http://', in input
    order, and returns True if there was at least one, False otherwise.
    """
    insecure = [uri for uri in links if uri.startswith('http://')]

    for link in insecure:
        print(f'ERROR: URI \'{link}\' uses insecure "http" protocol')

    return bool(insecure)
|
|
|
|
|
|
|
|
|
2024-10-08 15:06:17 +02:00
|
|
|
# checks prohibited external links to local files
|
|
|
|
def check_projectlinks(projectlinks, exceptions):
    """Report absolute links that point back into the local project.

    projectlinks: iterable of (link, filename) tuples
    exceptions:   optional list of strings; links found in a file whose
                  name contains any of them are tolerated (may be None)

    Prints one 'ERROR' line per link that is not tolerated and returns
    True if there was at least one, False otherwise.
    """
    tolerated = () if exceptions is None else exceptions
    fail = False

    for link, filename in projectlinks:
        if any(exc in filename for exc in tolerated):
            continue

        print(f'ERROR: prohibited external URI \'{link}\' to local project in \'{filename}\'')
        fail = True

    return fail
|
|
|
|
|
|
|
|
|
2022-05-31 15:15:57 +02:00
|
|
|
# Command line handling and the checks themselves.  The script exits with
# status 1 when any enabled check reported an error and with 0 otherwise.
parser = argparse.ArgumentParser(description='HTML reference checker')
parser.add_argument('--webroot', required=True,
                    help='path to the web root')
parser.add_argument('--entrypoint', default="index.html",
                    help='file name of web entry point relative to --webroot')
parser.add_argument('--external', action="store_true",
                    help='print external references instead')
parser.add_argument('--ignore-images', action='append',
                    help='paths to images that should be considered as used')
parser.add_argument('--require-https', action="store_true",
                    help='require secure https for external links')
parser.add_argument('--project-uri',
                    help='external prefix of the local project (e.g. https://libvirt.org); external links with that prefix are prohibited')
parser.add_argument('--project-uri-exceptions', action='append',
                    help='list of path prefixes excluded from the "--project-uri" checks')

args = parser.parse_args()

webroot = os.path.abspath(args.webroot)

files, imagefiles = get_file_list(webroot)

entrypoint = os.path.join(webroot, args.entrypoint)

targets, anchors, usedimages, projectlinks = process_all(files, args.project_uri)

fail = False

if args.external:
    # Only list the external references, each one once.  Links and images
    # are de-duplicated independently so that a URI used both as a link
    # and as an image is printed in both lists.
    for ext in sorted(set(externallinks)):
        print(f'link: {ext}')

    for ext in sorted(set(externalimages)):
        print(f'image: {ext}')
else:
    # run every check even after a failure so that all errors get printed
    if check_targets(targets, anchors):
        fail = True

    if check_usage(targets, files, entrypoint):
        fail = True

    if check_images(usedimages, imagefiles, args.ignore_images):
        fail = True

    if check_projectlinks(projectlinks, args.project_uri_exceptions):
        fail = True

    if args.require_https:
        if check_https(externallinks):
            fail = True

        if check_https(externalimages):
            fail = True

sys.exit(1 if fail else 0)
|