"""
This code is hacky at best
I declare this code to be in the public domain

Scrapes comment / follower / skull counts for every Hackaday Prize entry from
a saved copy of the entry listing page and scatter-plots them by entry order,
highlighting the valid entries and the semifinalists.
"""
import io
import re

import numpy as np

# Saved copy of the semifinalist announcement; only needed to regenerate the
# cached ``semi_finalists`` list below (see ``scrape_semifinalist_ids``).
SEMIFINALIST_PAGE = r"50 Semifinalists Selected for Next Stage of The Hackaday Prize.htm"

# Saved webpage from the 23rd or so of all entries (the unicode dot symbol in
# the file name was changed to a "-").
# ^^^^ download your own data and point the string below at it
ENTRIES_PAGE = r"Official Hackaday Prize Entry - Hackaday Projects.htm"

# Project ID numbers of the semifinalists, cached from SEMIFINALIST_PAGE.
semi_finalists = [
    963, 1864, 1962, 1797, 2759, 2736, 1080, 1233, 956, 2017,
    1922, 1279, 2552, 2214, 1662, 1230, 1389, 2443, 1741, 205,
    2117, 1594, 1552, 2418, 1156, 249, 2678, 1569, 1395, 1129,
    1313, 2263, 1839, 1538, 1214, 1376, 1460, 1605, 1340, 1682,
    1981, 1677, 770, 2197, 1348, 812, 1974, 1437, 2264, 2040,
]
sf = np.array(semi_finalists)
sf.sort()

# Terrible quick searches to get the data. You could also use a proper html
# parser but this is easy.  Order matters: comments, followers, skulls, id.
searches = [
    re.compile("title=\"Comments\">(.*?)</span>"),
    re.compile("title=\"Followers\">(.*?)</span>"),
    re.compile("title=\"Skulls\">(.*?)</span>"),
    re.compile("<h3><a href=\"http://hackaday.io/project/([0123456789]*?)-.*?\" title="),
]

# The list of valid entries from the other unofficial statistics project
valid = np.sort(np.array([
    1007, 1011, 1033, 1039, 1052, 1058, 1071, 1075, 1080, 1085, 1087, 1092,
    1104, 1109, 1115, 1120, 1125, 1129, 1148, 1150, 1156, 1162, 1164, 1172,
    1181, 1183, 1188, 1205, 1214, 1219, 1220, 1222, 1230, 1231, 1233, 1239,
    1254, 1261, 1279, 1306, 1313, 1333, 1336, 1340, 1347, 1348, 1351, 1353,
    1368, 137, 1373, 1376, 1379, 1386, 1389, 1393, 1395, 1408, 1416, 1436,
    1437, 1439, 1460, 1470, 1471, 1476, 1477, 1484, 1490, 1498, 1500, 151,
    1519, 1524, 1528, 1529, 1531, 1533, 1534, 1538, 1539, 1543, 1544, 1546,
    1549, 1552, 1555, 1559, 1569, 1577, 158, 1590, 1594, 1605, 1607, 1608,
    1611, 1615, 1619, 1620, 1637, 1640, 1644, 1656, 1662, 1664, 1670, 1677,
    1678, 1682, 1712, 1720, 1726, 1730, 1734, 1735, 1740, 1741, 1742, 1743,
    1744, 1745, 1755, 1762, 1764, 1765, 1774, 1783, 1784, 1789, 1797, 181,
    1812, 1815, 1833, 1838, 1839, 1845, 1852, 1853, 1860, 1862, 1864, 1873,
    1877, 1878, 1887, 1888, 1890, 1892, 1895, 1903, 1911, 1915, 1917, 1922,
    1925, 1927, 1935, 1936, 1937, 1939, 1943, 1948, 1952, 1962, 1964, 1965,
    1972, 1974, 1981, 1986, 1990, 1991, 1994, 1996, 1998, 2000, 2009, 2010,
    2011, 2013, 2015, 2017, 2021, 2025, 2026, 2028, 2030, 2037, 2040, 205,
    2056, 2058, 2065, 2068, 2076, 2090, 2106, 2114, 2117, 2121, 2124, 2136,
    2137, 2139, 214, 2142, 2144, 2150, 2151, 2155, 2157, 2158, 2160, 2164,
    2169, 2170, 2172, 2173, 2174, 2179, 2185, 2189, 2190, 2194, 2197, 2198,
    2200, 2203, 2206, 2208, 2211, 2214, 2215, 2216, 2218, 2225, 2226, 2231,
    2233, 2236, 2249, 2250, 2252, 2260, 2263, 2264, 2265, 2269, 2279, 2280,
    2281, 2284, 2286, 2288, 2291, 2292, 2298, 2307, 2308, 2309, 2310, 2311,
    2312, 2318, 2333, 2336, 2338, 2343, 2346, 2347, 2355, 2363, 2371, 2372,
    2373, 2375, 2376, 2384, 2385, 2386, 2387, 2388, 239, 2392, 2396, 2403,
    2404, 2413, 2414, 2415, 2416, 2418, 2422, 2423, 2424, 2429, 2431, 2434,
    2435, 2436, 2440, 2442, 2443, 2455, 2457, 2461, 2465, 2468, 2470, 2471,
    2473, 2474, 2475, 2478, 2479, 2483, 2484, 2489, 249, 2491, 2492, 2494,
    2497, 2500, 2503, 2504, 2507, 2512, 2517, 2519, 2521, 2522, 2524, 2528,
    2530, 2532, 2533, 2534, 2543, 2548, 2549, 2550, 2552, 2555, 2557, 2558,
    2560, 2562, 2566, 2567, 2568, 2570, 2572, 2574, 2577, 2579, 2581, 2585,
    2589, 2590, 2591, 2594, 2598, 2601, 2602, 2604, 2607, 2610, 2611, 2612,
    2614, 2615, 2616, 2617, 2618, 2619, 2624, 2626, 2633, 2634, 2639, 2640,
    2644, 2655, 2656, 2673, 2674, 2676, 2678, 2679, 2683, 2693, 2697, 270,
    2702, 2703, 2705, 2708, 2711, 2714, 2724, 2727, 2731, 2736, 2737, 2748,
    2759, 2760, 2761, 2769, 2774, 2782, 2784, 2786, 2804, 2813, 2818, 2823,
    2827, 294, 456, 570, 587, 604, 675, 711, 770, 812, 852, 87, 930, 934,
    945, 956, 957, 962, 963, 964, 967, 970, 983, 986, 996,
]))


def extract_counts(lines, patterns=None):
    """Scan *lines* and collect the integers captured by each pattern.

    For every line, each pattern is tried in turn; when it matches with a
    non-empty first capture, that capture is converted with ``int`` and
    appended to the pattern's own list.

    Args:
        lines: Iterable of text lines (e.g. an open text-mode file).
        patterns: Compiled regexes with one capture group each. Defaults to
            the module-level ``searches``.

    Returns:
        A list with one list of ints per pattern, in pattern order.

    Raises:
        ValueError: If a captured value is not a valid integer literal.
    """
    if patterns is None:
        patterns = searches
    columns = [[] for _ in patterns]
    for line in lines:
        for column, pattern in zip(columns, patterns):
            found = pattern.findall(line)
            if found and found[0]:
                column.append(int(found[0]))
    return columns


def scrape_semifinalist_ids(path=SEMIFINALIST_PAGE):
    """Return the project IDs linked from the saved semifinalist page.

    Not called during a normal run: the result is cached in
    ``semi_finalists``. Run it by hand to regenerate that list.
    """
    link = re.compile("<li><a href=\"http://hackaday.io/project/(.*?)\" target=\"_")
    # Text mode so the str patterns work on both Python 2 and 3.
    with io.open(path, "r", encoding="utf-8", errors="replace") as page:
        return extract_counts(page, [link])[0]


def stats_table(columns):
    """Build the entry table from scraped columns, sorted by project id.

    Args:
        columns: Four equally long sequences of ints in the order
            comments, followers, skulls, project id.

    Returns:
        A ``(4, n)`` integer array whose columns are ordered by ascending
        project id (row 3).

    Raises:
        ValueError: If the columns differ in length, which means one of the
            searches missed (or double-matched) an entry and the rows would
            no longer line up.
    """
    lengths = [len(column) for column in columns]
    if len(set(lengths)) > 1:
        raise ValueError(
            "scraped columns have different lengths %r; "
            "the page layout does not match the searches" % (lengths,))
    table = np.array(columns, dtype=np.int_)
    order = np.argsort(table[3])  # sort by id#
    return table[:, order]  # sort all data by id#


def build_reverse_index(ids):
    """Map project id -> position of that id within *ids*.

    Args:
        ids: Sequence of non-negative project ids.

    Returns:
        An int array ``rev`` of length ``max(ids) + 1`` (empty when *ids* is
        empty) with ``rev[ids[i]] == i``. Slots for ids that do not occur in
        *ids* hold ``-1`` so they cannot be confused with entry 0.
    """
    ids = np.asarray(ids, dtype=np.int_)
    if ids.size == 0:
        return np.full(0, -1, dtype=np.int_)
    reverse = np.full(ids.max() + 1, -1, dtype=np.int_)
    # One position per id: the right-hand side must have len(ids) elements,
    # not max(ids), or the assignment shapes do not match.
    reverse[ids] = np.arange(ids.size)
    return reverse


def lookup_positions(reverse_idx, wanted_ids):
    """Translate project ids to entry positions, dropping unknown ids.

    Args:
        reverse_idx: Array produced by ``build_reverse_index``.
        wanted_ids: Sequence of project ids to look up.

    Returns:
        Int array of positions for those ids that are present; ids outside
        the index range or marked ``-1`` are skipped.
    """
    wanted = np.asarray(wanted_ids, dtype=np.int_)
    in_range = (wanted >= 0) & (wanted < reverse_idx.size)
    positions = reverse_idx[wanted[in_range]]
    return positions[positions >= 0]


def main():
    """Scrape ENTRIES_PAGE and show one scatter plot per statistic."""
    # Imported lazily so the scraping helpers above can be imported and
    # exercised without matplotlib or a display backend.
    import matplotlib.pyplot as plt

    with io.open(ENTRIES_PAGE, "r", encoding="utf-8", errors="replace") as page:
        table = stats_table(extract_counts(page))

    # Grab the parts and name them nicely for easy use
    num_comments, followers, skulls, ids = table

    reverse_idx = build_reverse_index(ids)
    print(reverse_idx)

    # Get entry numbers of semifinalists and valid entries
    entry_order = reverse_idx[ids]
    sf_nums = lookup_positions(reverse_idx, sf)
    valid_nums = lookup_positions(reverse_idx, valid)

    # plots it
    datasets = (
        (num_comments, "Comments"),
        (followers, "Followers"),
        (skulls, "Skulls"),
        (num_comments + followers + skulls, "total"),
    )
    for ds, ds_name in datasets:
        fig = plt.figure()
        fig.suptitle('HAD Prize semifinalist statistics by ' + ds_name,
                     fontsize=14, fontweight='bold')
        ax = fig.add_subplot(111)
        fig.subplots_adjust(top=0.85)

        ax.set_xlabel('Entry order number')
        ax.set_ylabel('Number of ' + ds_name)
        # For a log axis add: ax.set_yscale('log')

        ax.scatter(entry_order, ds, label='Entry')
        ax.scatter(entry_order[valid_nums], ds[valid_nums],
                   label='Valid', color='g')
        ax.scatter(entry_order[sf_nums], ds[sf_nums],
                   color='r', label='Semifinalist')
        ax.legend()
    plt.show()


if __name__ == "__main__":
    main()