forked from MapComplete/MapComplete
		
	
		
			
				
	
	
		
			393 lines
		
	
	
	
		
			14 KiB
		
	
	
	
		
			Python
		
	
	
	
	
	
			
		
		
	
	
			393 lines
		
	
	
	
		
			14 KiB
		
	
	
	
		
			Python
		
	
	
	
	
	
| import csv
 | |
| import string
 | |
| from datetime import datetime
 | |
| 
 | |
| from matplotlib import pyplot
 | |
| import re
 | |
| 
 | |
| 
 | |
| def counts(lst):
 | |
|     counts = {}
 | |
|     for v in lst:
 | |
|         if not v in counts:
 | |
|             counts[v] = 0
 | |
|         counts[v] += 1
 | |
|     return counts
 | |
| 
 | |
| 
 | |
| class Hist:
 | |
| 
 | |
|     def __init__(self, firstcolumn):
 | |
|         self.key = "\"" + firstcolumn + "\""
 | |
|         self.dictionary = {}
 | |
|         self.key = ""
 | |
| 
 | |
|     def add(self, key, value):
 | |
|         if not key in self.dictionary:
 | |
|             self.dictionary[key] = []
 | |
|         self.dictionary[key].append(value)
 | |
| 
 | |
|     def values(self):
 | |
|         allV = []
 | |
|         for v in self.dictionary.values():
 | |
|             allV += list(set(v))
 | |
|         return list(set(allV))
 | |
|     
 | |
|     def keys(self):
 | |
|         return self.dictionary.keys()
 | |
| 
 | |
|     def get(self, key):
 | |
|         if key in self.dictionary:
 | |
|             return self.dictionary[key]
 | |
|         return None
 | |
| 
 | |
|     # Returns (keys, values.map(f)). To be used with e.g. pyplot.plot
 | |
|     def map(self, f):
 | |
|         vals = []
 | |
|         keys = self.keys()
 | |
|         for key in keys:
 | |
|             vals.append(f(self.get(key)))
 | |
|         return vals
 | |
| 
 | |
|     def mapcumul(self, f, add, zero):
 | |
|         vals = []
 | |
|         running_value = zero
 | |
|         keys = self.keys()
 | |
|         for key in keys:
 | |
|             v = f(self.get(key))
 | |
|             running_value = add(running_value, v)
 | |
|             vals.append(running_value)
 | |
|         return vals
 | |
| 
 | |
|     # Returns [(key, flatten(values))]
 | |
|     def flatten(self, flatten):
 | |
|         result = []
 | |
|         keys = self.keys()
 | |
|         for key in keys:
 | |
|             v = flatten(self.get(key))
 | |
|             result.append((key, v))
 | |
|         return result
 | |
| 
 | |
|     def csv(self):
 | |
|         csv = self.key + "," + ",".join(self.values())
 | |
|         header = self.values()
 | |
|         for k in self.dictionary.keys():
 | |
|             csv += k
 | |
|             values = counts(self.dictionary[k])
 | |
|             for head in header:
 | |
|                 if head in values:
 | |
|                     csv += "," + str(values[head])
 | |
|                 else:
 | |
|                     csv += ",0"
 | |
|             csv += "\n"
 | |
|         return csv
 | |
| 
 | |
|     def __str__(self):
 | |
|         return str(self.dictionary)
 | |
| 
 | |
| 
 | |
| def build_hist(stats, keyIndex, valueIndex):
 | |
|     hist = Hist("date")
 | |
|     c = 0
 | |
|     for row in stats:
 | |
|         c += 1
 | |
|         hist.add(row[keyIndex], row[valueIndex])
 | |
|     return hist
 | |
| 
 | |
| 
 | |
| def as_date(str):
 | |
|     return datetime.strptime(str, "%Y-%m-%d")
 | |
| 
 | |
| 
 | |
| def cumulative_users(stats):
 | |
|     users_hist = build_hist(stats, 0, 1)
 | |
|     all_users_per_day = users_hist.mapcumul(
 | |
|         lambda users: set(users),
 | |
|         lambda a, b: a.union(b),
 | |
|         set([])
 | |
|     )
 | |
|     cumul_uniq = list(map(len, all_users_per_day))
 | |
|     unique_per_day = users_hist.map(lambda users: len(set(users)))
 | |
|     new_users = [0]
 | |
|     for i in range(len(cumul_uniq) - 1):
 | |
|         new_users.append(cumul_uniq[i + 1] - cumul_uniq[i])
 | |
|     dates = map(as_date, users_hist.keys())
 | |
|     return list(dates), cumul_uniq, list(unique_per_day), list(new_users)
 | |
| 
 | |
| 
 | |
| def pyplot_init():
 | |
|     pyplot.figure(figsize=(14, 8), dpi=200)
 | |
|     pyplot.xticks(rotation='vertical')
 | |
| 
 | |
| 
 | |
| def create_usercount_graphs(stats, extra_text=""):
 | |
|     print("Creating usercount graphs " + extra_text)
 | |
|     dates, cumul_uniq, unique_per_day, new_users = cumulative_users(stats)
 | |
|     total = cumul_uniq[-1]
 | |
| 
 | |
|     pyplot_init()
 | |
|     pyplot.bar(dates, unique_per_day, label='Unique contributors')
 | |
|     pyplot.bar(dates, new_users, label='First time contributor via MapComplete')
 | |
|     pyplot.legend()
 | |
|     pyplot.title("Unique contributors" + extra_text + ' with MapComplete (' + str(total) + ' contributors)')
 | |
|     pyplot.ylabel("Number of unique contributors")
 | |
|     pyplot.xlabel("Date")
 | |
|     pyplot.savefig("Contributors" + extra_text + ".png", dpi=400, facecolor='w', edgecolor='w')
 | |
| 
 | |
|     pyplot_init()
 | |
|     pyplot.plot(dates, cumul_uniq, label='Cumulative unique contributors')
 | |
|     pyplot.legend()
 | |
|     pyplot.title("Cumulative unique contributors" + extra_text + " with MapComplete - " + str(total) + " contributors")
 | |
|     pyplot.ylabel("Number of unique contributors")
 | |
|     pyplot.xlabel("Date")
 | |
|     pyplot.savefig("CumulativeContributors" + extra_text + ".png", dpi=400, facecolor='w', edgecolor='w')
 | |
| 
 | |
| 
 | |
| def create_theme_breakdown(stats, fileExtra="", cutoff=5):
 | |
|     print("Creating theme breakdown " + fileExtra)
 | |
|     themeCounts = {}
 | |
|     for row in stats:
 | |
|         theme = row[3].lower()
 | |
|         if theme in theme_remappings:
 | |
|             theme = theme_remappings[theme]
 | |
|         if theme in themeCounts:
 | |
|             themeCounts[theme] += 1
 | |
|         else:
 | |
|             themeCounts[theme] = 1
 | |
|     themes = list(themeCounts.items())
 | |
|     if len(themes) == 0:
 | |
|         print("No entries found for theme breakdown (extra: " + str(fileExtra) + ")")
 | |
|         return
 | |
|     themes.sort(key=lambda kv: kv[1], reverse=True)
 | |
|     other_count = sum([theme[1] for theme in themes if theme[1] < cutoff])
 | |
|     themes_filtered = [theme for theme in themes if theme[1] >= cutoff]
 | |
|     keys = list(map(lambda kv: kv[0] + " (" + str(kv[1]) + ")", themes_filtered))
 | |
|     values = list(map(lambda kv: kv[1], themes_filtered))
 | |
|     total = sum(map(lambda kv: kv[1], themes))
 | |
|     first_pct = themes[0][1] / total;
 | |
|     if other_count > 0:
 | |
|         keys.append("other")
 | |
|         values.append(other_count)
 | |
|     pyplot_init()
 | |
|     pyplot.pie(values, labels=keys, startangle=(90 - 360 * first_pct / 2))
 | |
|     pyplot.title("MapComplete changes per theme" + fileExtra + " - " + str(total) + " total changes")
 | |
|     pyplot.savefig("Theme distribution" + fileExtra + ".png", dpi=400, facecolor='w', edgecolor='w',
 | |
|                    bbox_inches='tight')
 | |
|     return themes
 | |
| 
 | |
| def summed_changes_per(contents, extraText, sum_column=5):
 | |
|     newPerDay = build_hist(contents, 0, 5)
 | |
|     kv = newPerDay.flatten(sum)
 | |
|     keysNew = list(map(lambda kv: as_date(kv[0]), kv))
 | |
|     valuesNew = list(map(lambda kv: kv[1], kv))
 | |
|     changedPerDay = build_hist(contents, 0, 6)
 | |
|     kv = changedPerDay.flatten(sum)
 | |
|     keysChanged = list(map(lambda kv: as_date(kv[0]), kv))
 | |
|     valuesChanged = list(map(lambda kv: kv[1], kv))
 | |
|     if len(keysChanged) == 0 and len(keysNew) == 0:
 | |
|         return
 | |
| 
 | |
|     pyplot_init()
 | |
|     text = "New and changed nodes per day "+extraText
 | |
|     pyplot.title(text)
 | |
|     if len(keysChanged) > 0:
 | |
|         pyplot.bar(keysChanged, valuesChanged, label="Changed")
 | |
|     if len(keysNew) > 0:
 | |
|         pyplot.bar(keysNew, valuesNew, label="New")
 | |
|     pyplot.legend()
 | |
|     pyplot.savefig(text)
 | |
| 
 | |
| def cumulative_changes_per(contents, index, subject, filenameextra="", cutoff=5, cumulative=True, sort=True):
 | |
|     print("Creating graph about " + subject + filenameextra)
 | |
|     themes = Hist("date")
 | |
|     dates_per_theme = Hist("theme")
 | |
|     all_themes = set()
 | |
|     for row in contents:
 | |
|         th = row[index]
 | |
|         all_themes.add(th)
 | |
|         themes.add(as_date(row[0]), th)
 | |
|         dates_per_theme.add(th, row[0])
 | |
|     per_theme_count = list(zip(dates_per_theme.keys(), dates_per_theme.map(len)))
 | |
|     # PerThemeCount gives the most popular theme first
 | |
|     if sort == True:
 | |
|         per_theme_count.sort(key=lambda kv: kv[1], reverse=False)
 | |
|     elif sort is not None:
 | |
|         per_theme_count.sort(key=sort)
 | |
|     values_to_show = []  # (theme name, value to fill between - this is stacked, with the first layer to print last)
 | |
|     running_totals = None
 | |
|     other_total = 0
 | |
|     other_theme_count = 0
 | |
|     other_cumul = None
 | |
| 
 | |
|     for kv in per_theme_count:
 | |
|         theme = kv[0]
 | |
|         total_for_this_theme = kv[1]
 | |
|         if cumulative:
 | |
|             edits_per_day_cumul = themes.mapcumul(
 | |
|                 lambda themes_for_date: len([x for x in themes_for_date if theme == x]),
 | |
|                 lambda a, b: a + b, 0)
 | |
|         else:
 | |
|             edits_per_day_cumul = themes.map(lambda themes_for_date: len([x for x in themes_for_date if theme == x]))
 | |
| 
 | |
|         if (not cumulative) or (running_totals is None):
 | |
|             running_totals = edits_per_day_cumul
 | |
|         else:
 | |
|             running_totals = list(map(lambda ab: ab[0] + ab[1], zip(running_totals, edits_per_day_cumul)))
 | |
| 
 | |
|         if total_for_this_theme >= cutoff:
 | |
|             values_to_show.append((theme, running_totals))
 | |
|         else:
 | |
|             other_total += total_for_this_theme
 | |
|             other_theme_count += 1
 | |
|             if other_cumul is None:
 | |
|                 other_cumul = edits_per_day_cumul
 | |
|             else:
 | |
|                 other_cumul = list(map(lambda ab: ab[0] + ab[1], zip(other_cumul, edits_per_day_cumul)))
 | |
| 
 | |
|     keys = list(themes.keys())
 | |
|     values_to_show.reverse()
 | |
|     values_to_show.append(("other", other_cumul))
 | |
|     totals = dict(per_theme_count)
 | |
|     total = sum(totals.values())
 | |
|     totals["other"] = other_total
 | |
| 
 | |
|     pyplot_init()
 | |
|     for kv in values_to_show:
 | |
|         if kv[1] is None:
 | |
|             continue  # No 'other' graph
 | |
|         msg = kv[0] + " (" + str(totals[kv[0]]) + ")"
 | |
|         if kv[0] == "other":
 | |
|             msg = str(other_theme_count) + " small " + subject + "s (" + str(other_total) + " changes)"
 | |
|         if cumulative:
 | |
|             pyplot.fill_between(keys, kv[1], label=msg)
 | |
|         else:
 | |
|             pyplot.plot(keys, kv[1], label=msg)
 | |
| 
 | |
|     if cumulative:
 | |
|         cumulative_txt = "Cumulative changesets"
 | |
|     else:
 | |
|         cumulative_txt = "Changesets"
 | |
|     pyplot.title(cumulative_txt + " per " + subject + filenameextra + " (" + str(total) + " changesets)")
 | |
|     pyplot.legend(loc="upper left", ncol=3)
 | |
|     pyplot.savefig(cumulative_txt + " per " + subject + filenameextra + ".png")
 | |
| 
 | |
| 
 | |
| def contents_where(contents, index, starts_with, invert=False):
 | |
|     for row in contents:
 | |
|         if row[index].startswith(starts_with) is not invert:
 | |
|             yield row
 | |
| 
 | |
| 
 | |
| def sortable_user_number(kv):
 | |
|     str = kv[0]
 | |
|     ls = list(map(lambda str : "0"+str if len(str) < 2 else str, re.findall("[0-9]+", str)))
 | |
|     return ".".join(ls)
 | |
| 
 | |
| 
 | |
| def create_graphs(contents):
 | |
|     summed_changes_per(contents, "")
 | |
|     cumulative_changes_per(contents, 4, "version number", cutoff=1, sort=sortable_user_number)
 | |
|     create_usercount_graphs(contents)
 | |
|     create_theme_breakdown(contents)
 | |
|     cumulative_changes_per(contents, 3, "created element", cutoff=10)
 | |
|     cumulative_changes_per(contents, 3, "theme", cutoff=10)
 | |
|     cumulative_changes_per(contents, 3, "theme", cutoff=10, cumulative=False)
 | |
|     cumulative_changes_per(contents, 1, "contributor", cutoff=15)
 | |
|     cumulative_changes_per(contents, 2, "language", cutoff=1)
 | |
|     cumulative_changes_per(contents, 8, "host", cutoff=1)
 | |
| 
 | |
|     currentYear = datetime.now().year
 | |
|     for year in range(2020, currentYear + 1):
 | |
|         contents_filtered = list(contents_where(contents, 0, str(year)))
 | |
|         extratext = " in " + str(year)
 | |
|         create_usercount_graphs(contents_filtered, extratext)
 | |
|         create_theme_breakdown(contents_filtered, extratext)
 | |
|         cumulative_changes_per(contents_filtered, 3, "theme", extratext, cutoff=5)
 | |
|         cumulative_changes_per(contents_filtered, 3, "theme", extratext, cutoff=5, cumulative=False)
 | |
|         cumulative_changes_per(contents_filtered, 1, "contributor", extratext, cutoff=10)
 | |
|         cumulative_changes_per(contents_filtered, 2, "language", extratext, cutoff=1)
 | |
|         cumulative_changes_per(contents_filtered, 4, "version number", extratext, cutoff=1, cumulative=False,
 | |
|                                sort=sortable_user_number)
 | |
|         cumulative_changes_per(contents_filtered, 4, "version number", extratext, cutoff=1, sort=sortable_user_number)
 | |
|         cumulative_changes_per(contents_filtered, 8, "host", extratext, cutoff=1)
 | |
|         summed_changes_per(contents_filtered, "for year "+str(year))
 | |
| 
 | |
| 
 | |
| 
 | |
| def create_per_theme_graphs(contents, cutoff=10):
 | |
|     all_themes = set(map(lambda row: row[3], contents))
 | |
|     for theme in all_themes:
 | |
|         filtered = list(contents_where(contents, 3, theme))
 | |
|         if len(filtered) < cutoff:
 | |
|             # less then 10 changesets - we do not map it
 | |
|             continue
 | |
|         contributors = set(map(lambda row: row[1], filtered))
 | |
|         if len(contributors) >= 2:
 | |
|             cumulative_changes_per(filtered, 1, "contributor", " for theme " + theme, cutoff=1)
 | |
|         if len(filtered) > 25:
 | |
|             summed_changes_per(filtered, "for theme "+theme)
 | |
| 
 | |
| 
 | |
| def create_per_contributor_graphs(contents, least_needed_changesets):
 | |
|     all_contributors = set(map(lambda row: row[1], contents))
 | |
|     for contrib in all_contributors:
 | |
|         filtered = list(contents_where(contents, 1, contrib))
 | |
|         if len(filtered) < least_needed_changesets:
 | |
|             print("Skipping "+contrib+" - too little changesets");
 | |
|             continue
 | |
|         themes = set(map(lambda row: row[3], filtered))
 | |
|         if len(themes) >= 2:
 | |
|             cumulative_changes_per(filtered, 3, "theme", " for contributor " + contrib, cutoff=1)
 | |
|         if len(filtered) > 25:
 | |
|             summed_changes_per(filtered, "for contributor "+contrib)
 | |
| 
 | |
| 
 | |
| theme_remappings = {
 | |
|     "metamap": "maps",
 | |
|     "groen": "buurtnatuur",
 | |
|     "updaten van metadata met mapcomplete": "buurtnatuur",
 | |
|     "Toevoegen of dit natuurreservaat toegangkelijk is":"buurtnatuur",
 | |
|     "wiki:mapcomplete/fritures": "fritures",
 | |
|     "wiki:MapComplete/Fritures": "fritures",
 | |
|     "lits": "lit",
 | |
|     "pomp": "cyclofix",
 | |
|     "wiki:user:joost_schouppe/campersite": "campersite",
 | |
|     "wiki-user-joost_schouppe-geveltuintjes": "geveltuintjes",
 | |
|     "wiki-user-joost_schouppe-campersite": "campersite",
 | |
|     "wiki-User-joost_schouppe-campersite": "campersite",
 | |
|     "wiki-User-joost_schouppe-geveltuintjes": "geveltuintjes",
 | |
|     "wiki:User:joost_schouppe/campersite": "campersite",
 | |
|     "https://raw.githubusercontent.com/osmbe/play/master/mapcomplete/geveltuinen/geveltuinen.json": "geveltuintjes"
 | |
| }
 | |
| 
 | |
| 
 | |
| def clean_input(contents):
 | |
|     for row in contents:
 | |
|         theme = row[3].strip().strip("\"").lower()
 | |
|         if theme == "null":
 | |
|             # The theme metadata has only been set later on - we fetch this from the comment
 | |
|             i = row[7].rfind("#")
 | |
|             theme = row[7][i + 1:-1].lower()
 | |
|         if theme in theme_remappings:
 | |
|             theme = theme_remappings[theme]
 | |
|         row[3] = theme
 | |
|         row[4] = row[4].strip().strip("\"")[len("MapComplete "):]
 | |
|         row[4] = re.findall("[0-9]*\.[0-9]*\.[0-9]*", row[4])[0]
 | |
|         row = [data.strip().strip("\"") for data in row]
 | |
|         row[5] = int(row[5])
 | |
|         row[6] = int(row[6])
 | |
|         yield row
 | |
| 
 | |
| 
 | |
| def main():
 | |
|     print("Creating graphs...")
 | |
|     with open('stats.csv', newline='') as csvfile:
 | |
|         stats = list(clean_input(csv.reader(csvfile, delimiter=',', quotechar='"')))
 | |
|         print("Found " + str(len(stats)) + " changesets")
 | |
|         create_graphs(stats)
 | |
|         create_per_theme_graphs(stats, 15)
 | |
|         # create_per_contributor_graphs(stats, 25)
 | |
|     print("All done!")
 | |
| 
 | |
| 
 | |
| main()
 |