From d1548e504937d6239bf35938c8213581f97046ae Mon Sep 17 00:00:00 2001
From: Ian Wienand <iwienand@redhat.com>
Date: Mon, 3 Apr 2023 14:11:58 +1000
Subject: [PATCH] tools/upstream-wheel-audit.py

This is a tool to tell us which of our on-disk wheels are duplicated
upstream by PyPI.  These are things we don't need to cache locally.

At one time, we were downloading all dependencies of our requirements
and caching them; we shouldn't be doing that any more, but anything
reported by this tool can be removed from our local mirrors.

Now that the number of platforms * number of branches is becoming a
maintenance issue, this will help us focus on keeping a useful working
set in the cache.

Change-Id: I3ded6b9869598a0907d7cda9f03bf414e46885df
---
 tools/upstream-wheel-audit.py | 85 +++++++++++++++++++++++++++++++++++
 1 file changed, 85 insertions(+)
 create mode 100644 tools/upstream-wheel-audit.py

diff --git a/tools/upstream-wheel-audit.py b/tools/upstream-wheel-audit.py
new file mode 100644
index 0000000000..aff350c8d0
--- /dev/null
+++ b/tools/upstream-wheel-audit.py
@@ -0,0 +1,85 @@
+# Check which of the wheels in our AFS directory exist upstream
+#
+# This outputs two files
+#
+#  to-delete.txt : a list of files and directories that can be removed
+#                  from the mirror as all contents are available on PyPI
+#
+#  log.txt       : the leading number is the number of files left
+#                  in the given directory after checking upstream
+#                  package contents, i.e. this is unique content in
+#                  our mirror volume.
+#
+# Needs pypi-simple
+
+import sys
+import os
+import json
+
+from pypi_simple import PyPISimple, NoSuchProjectError
+
+BASE = '/afs/openstack.org/mirror/wheel'  # parent of the PLATFORMS dirs
+
+FILE_DEL = open('to-delete.txt', 'w')  # receives paths that can be deleted
+FILE_LOG = open('log.txt', 'w')  # receives "<files left> <dir>" lines
+# Per-platform mirror directories under BASE that are audited in turn.
+PLATFORMS = ('centos-8-x86_64',
+             'centos-9-x86_64',
+             'debian-10-x86_64',
+             'debian-11-x86_64',
+             'ubuntu-18.04-aarch64',
+             'ubuntu-20.04-aarch64',
+             'ubuntu-22.04-aarch64',
+             'centos-8-aarch64',
+             'centos-9-aarch64',
+             'debian-10-aarch64',
+             'debian-11-aarch64',
+             'ubuntu-16.04-x86_64',
+             'ubuntu-18.04-x86_64',
+             'ubuntu-20.04-x86_64',
+             'ubuntu-22.04-x86_64')
+
+def iterate_wheels(path, d):
+    """Index path into tree d; log wheels PyPI also hosts to FILE_DEL."""
+    name = os.path.basename(path)
+    if os.path.isdir(path):
+        if name not in d['dirs']:
+            d['dirs'][name] = {'dirs': {}, 'files': []}
+        for x in os.listdir(path):
+            iterate_wheels(os.path.join(path, x), d['dirs'][name])
+
+        # top level (path ends in '/', so name is '') has index.html; skip
+        # otherwise the directory name is the pypi project name
+        if name and d['dirs'][name]['files']:
+            with PyPISimple() as client:
+                try:
+                    page = client.get_project_page(name)
+                except NoSuchProjectError:
+                    print("Removing disappeared project : %s" % name, file=sys.stderr)
+                    for w in d['dirs'][name]['files']:
+                        print("%s/%s" % (path, w), file=FILE_DEL)
+                    return d
+
+                upstream = {package.filename for package in page.packages}
+                local = set(d['dirs'][name]['files'])
+
+                not_upstream = local.difference(upstream)
+                dups = local.intersection(upstream)
+
+                # Print files to delete, and if the directory is empty
+                # put that in the list to delete too.
+                for dup in dups:
+                    print("%s/%s" % (path, dup), file=FILE_DEL)
+                if not not_upstream:
+                    print("%s" % path, file=FILE_DEL)
+
+                # Log how many files are left in the directory after pruning
+                print("%4d %s" % (len(not_upstream), path), file=FILE_LOG)
+    else:
+        d['files'].append(name)
+    return d
+
+for platform in PLATFORMS:
+    print("Processing %s" % platform, file=sys.stderr)
+    iterate_wheels('%s/%s/' % (BASE, platform),
+                   d={'dirs': {}, 'files': []})