scripts: check-html-refernces: Add checking for image file usage

Check both that a file is referenced from our pages and also that pages reference existing images. The mode for dumping external references now also dumps images. '--ignore-image' can be used repeatedly to suppress errors for specific images. Signed-off-by: Peter Krempa <pkrempa@redhat.com> Reviewed-by: Daniel P. Berrangé <berrange@redhat.com>
2024-12-22 13:45:38 +00:00 · 2023-02-14 14:38:40 +01:00 · 2023-02-14 14:38:40 +01:00 · edac6ca756
commit edac6ca756
parent 0e3970bf9b
1 changed files with 87 additions and 14 deletions
--- a/scripts/check-html-references.py
+++ b/scripts/check-html-references.py
@ -24,25 +24,32 @@ import xml.etree.ElementTree as ET

 ns = {'html': 'http://www.w3.org/1999/xhtml'}
 externallinks = []
+externalimages = []


 def get_file_list(prefix):
    filelist = []
+    imagelist = []
+    imageformats = ['.jpg', '.svg', '.png']

    for root, dir, files in os.walk(prefix):
        for file in files:
-            if not re.search('\\.html$', file):
-                continue
+            ext = os.path.splitext(file)[1]

-            # the 404 page doesn't play well
-            if '404.html' in file:
-                continue
+            if ext == '.html':
+                # the 404 page doesn't play well
+                if '404.html' in file:
+                    continue

-            filelist.append(os.path.join(root, file))
+                filelist.append(os.path.join(root, file))
+
+            elif ext in imageformats:
+                imagelist.append(os.path.join(root, file))

    filelist.sort()
+    imagelist.sort()

-    return filelist
+    return filelist, imagelist


 # loads an XHTML and extracts all anchors, local and remote links for the one file
@ -50,12 +57,14 @@ def process_file(filename):
    tree = ET.parse(filename)
    root = tree.getroot()
    docname = root.get('data-sourcedoc')
+    dirname = os.path.dirname(filename)

    if not docname:
        docname = filename

    anchors = [filename]
    targets = []
+    images = []

    for elem in root.findall('.//html:a', ns):
        target = elem.get('href')
@ -68,7 +77,6 @@ def process_file(filename):
            if re.search('://', target):
                externallinks.append(target)
            elif target[0] != '#' and 'mailto:' not in target:
-                dirname = os.path.dirname(filename)
                targetfull = os.path.normpath(os.path.join(dirname, target))

                targets.append((filename, docname, targetfull, target))
@ -87,20 +95,33 @@ def process_file(filename):
        if an:
            anchors.append(filename + '#' + an)

-    return (anchors, targets)
+    # find local images
+    for elem in root.findall('.//html:img', ns):
+        src = elem.get('src')
+
+        if src:
+            if re.search('://', src):
+                externalimages.append(src)
+            else:
+                imagefull = os.path.normpath(os.path.join(dirname, src))
+                images.append((imagefull, docname))
+
+    return (anchors, targets, images)


 def process_all(filelist):
    anchors = []
    targets = []
+    images = []

    for file in filelist:
-        anchor, target = process_file(file)
+        anchor, target, image = process_file(file)

        targets = targets + target
        anchors = anchors + anchor
+        images = images + image

-    return (targets, anchors)
+    return (targets, anchors, images)


 def check_targets(targets, anchors):
@ -163,6 +184,46 @@ def check_usage(targets, files, entrypoint):
    return fail


+# checks that images present in the directory are being used and also that
+# pages link to existing images. For favicons, which are not referenced from
+# the '.html' files there's a builtin set of exceptions.
+def check_images(usedimages, imagefiles, ignoreimages):
+    favicons = [
+        'android-chrome-192x192.png',
+        'android-chrome-256x256.png',
+        'apple-touch-icon.png',
+        'favicon-16x16.png',
+        'favicon-32x32.png',
+        'mstile-150x150.png',
+    ]
+    fail = False
+
+    if ignoreimages:
+        favicons = favicons + ignoreimages
+
+    for usedimage, docname in usedimages:
+        if usedimage not in imagefiles:
+            print(f'ERROR: \'{docname}\' references image \'{usedimage}\' not among images')
+            fail = True
+
+    for imagefile in imagefiles:
+        used = False
+
+        if imagefile in (usedimage[0] for usedimage in usedimages):
+            used = True
+        else:
+            for favicon in favicons:
+                if favicon in imagefile:
+                    used = True
+                    break
+
+        if not used:
+            print(f'ERROR: Image \'{imagefile}\' is not used by any page')
+            fail = True
+
+    return fail
+
+
 parser = argparse.ArgumentParser(description='HTML reference checker')
 parser.add_argument('--webroot', required=True,
                    help='path to the web root')
@ -170,14 +231,16 @@ parser.add_argument('--entrypoint', default="index.html",
                    help='file name of web entry point relative to --webroot')
 parser.add_argument('--external', action="store_true",
                    help='print external references instead')
+parser.add_argument('--ignore-images', action='append',
+                    help='paths to images that should be considered as used')

 args = parser.parse_args()

-files = get_file_list(os.path.abspath(args.webroot))
+files, imagefiles = get_file_list(os.path.abspath(args.webroot))

 entrypoint = os.path.join(os.path.abspath(args.webroot), args.entrypoint)

-targets, anchors = process_all(files)
+targets, anchors, usedimages = process_all(files)

 fail = False

@ -186,7 +249,14 @@ if args.external:
    externallinks.sort()
    for ext in externallinks:
        if ext != prev:
-            print(ext)
+            print(f'link: {ext}')
+
+        prev = ext
+
+    externalimages.sort()
+    for ext in externalimages:
+        if ext != prev:
+            print(f'image: {ext}')

        prev = ext
 else:
@ -196,6 +266,9 @@ else:
    if check_usage(targets, files, entrypoint):
        fail = True

+    if check_images(usedimages, imagefiles, args.ignore_images):
+        fail = True
+
    if fail:
        sys.exit(1)