PyPI (the Python Package Index) download statistics are difficult to obtain: no official statistics are provided, and the raw data is only available through Google BigQuery (which requires a paid account for extensive use).
Some techniques have been published for extracting the data for a single module (for example here). I have managed to extract the data for all modules. In addition, I filtered out downloads performed by the most common mirroring tools (such as bandersnatch), so the results are more accurate. The entire dataset can be downloaded in CSV format: pymod.csv
It includes per-month download statistics for all Python modules (more than 100,000) since May 2016. I will try to keep the dataset up to date.
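For example, the dataset can be inspected with Python's csv module. Here is a minimal sketch, assuming the column layout produced by the script below (a "Module name" column, one column per month such as "2016_05", and a final "Total" column), and using "owlready" as a hypothetical example of a PyPI project name:

```python
import csv

# Minimal sketch: print the per-month counts of one module from pymod.csv.
# Assumes the layout produced by the script below; "owlready" is just an
# example project name.
with open("pymod.csv") as f:
    rows = csv.reader(f)
    header = next(rows)  # ["Module name", "2016_05", ..., "Total"]
    for row in rows:
        if row[0] == "owlready":
            print(dict(zip(header[1:], row[1:])))
            break
```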
Here are the download statistics of my Python modules:
Module | Downloads
---|---
Owlready |
FullPy |
Pymedtermino |
Bibreview |
Editobj |
Soya3 |
Cerealizer |
ORMithorynque |
eClaircie |
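The figures in this table can be retrieved from the SQLite database that the script below builds alongside the CSV file, which contains a download_stats(module, last_month, total) table. A possible way to query it, assuming a pymod.sqlite3 file and hypothetical lowercase PyPI project names:

```python
import sqlite3

# Sketch: fetch last-month and total download counts from the SQLite
# database produced by the script below. The module names are
# hypothetical lowercase PyPI project names.
db = sqlite3.connect("pymod.sqlite3")
sql = db.cursor()
for module in ["owlready", "fullpy", "cerealizer"]:
    sql.execute("SELECT last_month, total FROM download_stats WHERE module = ?", (module,))
    row = sql.fetchone()
    if row:
        print(module, "- last month:", row[0], "- total:", row[1])
db.close()
```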
Here is my script. Beware: before running it, you need to create a Google BigQuery account, set the GOOGLE_APPLICATION_CREDENTIALS variable to the path of your JSON credentials file, and set the CACHE variable to the desired filename. In addition, the amount of data processed by queries is limited per day on Google BigQuery, thus do not expect to produce the whole dataset in a single attempt -- you need about 4-5 days for that.
```python
import sys, os, glob, csv, sqlite3, datetime, json
from collections import defaultdict
from io import StringIO

import googleapiclient, googleapiclient.discovery, googleapiclient.errors
import oauth2client.client

GOOGLE_APPLICATION_CREDENTIALS = "/home/jiba/src/My Project-daaf416e4853.json"
CACHE = "/home/jiba/tmp/pymod.csv"

os.environ["GOOGLE_APPLICATION_CREDENTIALS"] = GOOGLE_APPLICATION_CREDENTIALS
CACHE_DB = CACHE[:-3] + "sqlite3"

def load():
    """Load the cached stats from the CSV file, as a
    { "YYYY_MM" : { module : download count } } dict."""
    if not os.path.exists(CACHE): return {}
    f = open(CACHE)
    c = csv.reader(f)
    _, *dates, _ = next(c)  # Skip the "Module name" header and the final "Total" column
    data = { date : defaultdict(int) for date in dates }
    for row in c:
        module, *nbs, total = row
        for date, nb in zip(dates, nbs):
            data[date][module] = int(nb)
    f.close()
    return data

def save(data):
    """Save the stats to the CSV file and to a companion SQLite database."""
    projets = sorted({ projet for data_date in data.values() for projet in data_date.keys() })
    dates = sorted(data.keys())
    table = []
    for projet in projets:
        nbs = [data[date][projet] for date in dates]
        table.append([projet] + nbs + [sum(nbs)])
    b = StringIO()
    c = csv.writer(b)
    c.writerow(["Module name"] + dates + ["Total"])
    c.writerows(table)
    f = open(CACHE, "w")
    f.write(b.getvalue())
    f.close()

    # Rebuild the SQLite database with last-month and total counts per module.
    try: os.unlink(CACHE_DB)
    except OSError: pass
    db = sqlite3.connect(CACHE_DB)
    sql = db.cursor()
    table = []
    last_date = max(dates)
    for projet in projets:
        nbs = [data[date][projet] for date in dates]
        table.append([projet, data[last_date][projet], sum(nbs)])
    sql.execute("""CREATE TABLE download_stats (module TEXT, last_month INTEGER, total INTEGER)""")
    sql.executemany("""INSERT INTO download_stats VALUES (?, ?, ?)""", table)
    db.commit()

def add_month(data, date, s):
    """Parse the tab-separated (module, count) pairs returned by BigQuery."""
    if '"' in s: s = s.split('"')[1]
    data_date = data[date] = defaultdict(int)
    for ligne in csv.reader(StringIO(s), delimiter = "\t"):
        data_date[ligne[0]] = int(ligne[1])

def previous_month(d):
    if d.month == 1: return datetime.date(d.year - 1, 12, 1)
    return datetime.date(d.year, d.month - 1, 1)

def last_month_day(d):
    """Return the date of the last day of d's month."""
    day = 28
    while True:
        try: d2 = datetime.date(d.year, d.month, day)
        except ValueError: return d
        day += 1
        d = d2

def update(data):
    """Query every month missing from the cache, from last month back to May 2016."""
    dates = set(data.keys())
    d = previous_month(datetime.date.today())
    stop_at = datetime.date(2016, 4, 1)
    while d > stop_at:
        date = "%s_%02i" % (d.year, d.month)
        if not date in dates:
            update_month(data, d)
        d = previous_month(d)

class BigQuery(object):
    def __init__(self):
        self.credentials = oauth2client.client.GoogleCredentials.get_application_default()
        f = open(GOOGLE_APPLICATION_CREDENTIALS, "r")
        credentials_data = json.load(f)
        f.close()
        self.project_id = credentials_data["project_id"]
        self.bigquery = googleapiclient.discovery.build("bigquery", "v2", credentials = self.credentials)
        self.jobs = self.bigquery.jobs()

    def execute(self, query):
        response = self.jobs.query(projectId = self.project_id, body = { "query" : query }).execute()
        print(response)
        print(response["jobComplete"])
        if not response["jobComplete"]: raise RuntimeError("BigQuery job not complete")
        return response["rows"]

BIG_QUERY = None

def update_month(data, d):
    global BIG_QUERY
    if BIG_QUERY is None: BIG_QUERY = BigQuery()

    print("Querying for month:", d)
    # Count the downloads per project over the whole month, excluding
    # the most common mirroring tools.
    query = """SELECT GROUP_CONCAT(x, '\n')
FROM (
  SELECT CONCAT(file.project, '\t', STRING(count(*))) AS x
  FROM TABLE_DATE_RANGE(
    [the-psf:pypi.downloads],
    TIMESTAMP('%s-%02i-01 00:00:00'),
    TIMESTAMP('%s-%02i-%02i 23:59:59')
  )
  WHERE (details.installer.name != "bandersnatch")
    AND (details.installer.name != "z3c.pypimirror")
  GROUP BY file.project
)""" % (d.year, d.month, d.year, d.month, last_month_day(d).day)
    print(query)
    rows = BIG_QUERY.execute(query)
    s = rows[0]["f"][0]["v"]
    f = open("/home/jiba/tmp/pymod_%s_%02i.csv" % (d.year, d.month), "w")
    f.write(s)
    f.close()
    add_month(data, "%s_%02i" % (d.year, d.month), s)
    save(data)  # Save after each month, so an interrupted run can resume later

data = load()
update(data)
save(data)
```