docs: Prohibit 'external' links within the webpage

Enforce that relative links are used within the web pages, so that local
installations don't require an internet connection and don't redirect to
the web needlessly.

This is done by flagging any absolute link that points back to the
project's own URI (barring exceptions) when checking links with
'check-html-references.py'.

Signed-off-by: Peter Krempa <pkrempa@redhat.com>
Reviewed-by: Ján Tomko <jtomko@redhat.com>
Peter Krempa 2024-10-08 15:06:17 +02:00
parent dcc8deb536
commit 07467e2719
2 changed files with 43 additions and 6 deletions


@@ -359,6 +359,9 @@ if tests_enabled[0]
       args: [
         check_html_references_prog.full_path(),
         '--require-https',
+        '--project-uri', 'https://libvirt.org',
+        '--project-uri-exceptions', 'docs/manpages/',
+        '--project-uri-exceptions', 'docs/html/',
         '--webroot',
         meson.project_build_root() / 'docs'
       ],
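
For reference, with these arguments the meson-run test corresponds roughly to
the following manual invocation (the build-tree path is illustrative):

    check-html-references.py \
        --require-https \
        --project-uri https://libvirt.org \
        --project-uri-exceptions docs/manpages/ \
        --project-uri-exceptions docs/html/ \
        --webroot <builddir>/docs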


@@ -53,7 +53,7 @@ def get_file_list(prefix):
 
 # loads an XHTML and extracts all anchors, local and remote links for the one file
-def process_file(filename):
+def process_file(filename, project_uri):
     tree = ET.parse(filename)
     root = tree.getroot()
 
     docname = root.get('data-sourcedoc')
@@ -65,6 +65,7 @@ def process_file(filename):
     anchors = [filename]
     targets = []
     images = []
+    projectlinks = []
 
     for elem in root.findall('.//html:a', ns):
         target = elem.get('href')
@@ -76,6 +77,10 @@
         if target:
             if re.search('://', target):
                 externallinks.append(target)
+
+                if project_uri is not None and target.startswith(project_uri):
+                    projectlinks.append((target, docname))
+
             elif target[0] != '#' and 'mailto:' not in target:
                 targetfull = os.path.normpath(os.path.join(dirname, target))
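
To illustrate what the new branch collects, here is a minimal stand-alone
sketch of the classification added above (the docname bookkeeping is omitted
and the hrefs are made up):

    import re

    # Absolute links are treated as external; those that also point back at the
    # project URI are additionally recorded as prohibited project links.
    project_uri = 'https://libvirt.org'
    hrefs = [
        'https://libvirt.org/formatdomain.html',  # absolute link back to the project -> flagged
        'https://www.qemu.org/',                  # link to another site -> external only
        'formatdomain.html#elementsMetadata',     # relative link -> checked against local targets
    ]

    externallinks = []
    projectlinks = []
    for target in hrefs:
        if re.search('://', target):
            externallinks.append(target)
            if project_uri is not None and target.startswith(project_uri):
                projectlinks.append(target)
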
@@ -106,22 +111,24 @@
             imagefull = os.path.normpath(os.path.join(dirname, src))
             images.append((imagefull, docname))
 
-    return (anchors, targets, images)
+    return (anchors, targets, images, projectlinks)
 
 
-def process_all(filelist):
+def process_all(filelist, project_uri):
     anchors = []
     targets = []
     images = []
+    projectlinks = []
 
     for file in filelist:
-        anchor, target, image = process_file(file)
+        anchor, target, image, projectlink = process_file(file, project_uri)
 
         targets = targets + target
         anchors = anchors + anchor
         images = images + image
+        projectlinks = projectlinks + projectlink
 
-    return (targets, anchors, images)
+    return (targets, anchors, images, projectlinks)
 
 
 def check_targets(targets, anchors):
@@ -236,6 +243,26 @@ def check_https(links):
     return fail
 
 
+# checks prohibited external links to local files
+def check_projectlinks(projectlinks, exceptions):
+    fail = False
+
+    for (link, filename) in projectlinks:
+        allowed = False
+
+        if exceptions is not None:
+            for exc in exceptions:
+                if exc in filename:
+                    allowed = True
+                    break
+
+        if not allowed:
+            print(f'ERROR: prohibited external URI \'{link}\' to local project in \'{filename}\'')
+            fail = True
+
+    return fail
+
+
 parser = argparse.ArgumentParser(description='HTML reference checker')
 parser.add_argument('--webroot', required=True,
                     help='path to the web root')
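
A usage sketch of the new helper, assuming the check_projectlinks() definition
above and made-up (link, source document) pairs:

    # The first pair is reported as a prohibited external URI; the second matches
    # the 'docs/manpages/' exception prefix and is allowed.  The helper returns
    # True because at least one prohibited link was found.
    links = [
        ('https://libvirt.org/formatdomain.html', 'docs/formatdomain.rst'),
        ('https://libvirt.org/manpages/virsh.html', 'docs/manpages/virsh.rst'),
    ]
    failed = check_projectlinks(links, ['docs/manpages/', 'docs/html/'])
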
@@ -247,6 +274,10 @@ parser.add_argument('--ignore-images', action='append',
                     help='paths to images that should be considered as used')
 parser.add_argument('--require-https', action="store_true",
                     help='require secure https for external links')
+parser.add_argument('--project-uri',
+                    help='external prefix of the local project (e.g. https://libvirt.org); external links with that prefix are prohibited')
+parser.add_argument('--project-uri-exceptions', action='append',
+                    help='list of path prefixes excluded from the "--project-uri" checks')
 
 args = parser.parse_args()
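
Because '--project-uri-exceptions' uses action='append', repeating the option
accumulates a list of prefixes; a minimal stand-alone sketch of that behaviour:

    import argparse

    # Each repeated '--project-uri-exceptions' appends to the resulting list.
    p = argparse.ArgumentParser()
    p.add_argument('--project-uri')
    p.add_argument('--project-uri-exceptions', action='append')
    args = p.parse_args(['--project-uri', 'https://libvirt.org',
                         '--project-uri-exceptions', 'docs/manpages/',
                         '--project-uri-exceptions', 'docs/html/'])
    assert args.project_uri_exceptions == ['docs/manpages/', 'docs/html/']
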
@@ -254,7 +285,7 @@ files, imagefiles = get_file_list(os.path.abspath(args.webroot))
 
 entrypoint = os.path.join(os.path.abspath(args.webroot), args.entrypoint)
 
-targets, anchors, usedimages = process_all(files)
+targets, anchors, usedimages, projectlinks = process_all(files, args.project_uri)
 
 fail = False
@@ -283,6 +314,9 @@ else:
     if check_images(usedimages, imagefiles, args.ignore_images):
         fail = True
 
+    if check_projectlinks(projectlinks, args.project_uri_exceptions):
+        fail = True
+
 if args.require_https:
     if check_https(externallinks):
         fail = True