Skip to content

Instantly share code, notes, and snippets.

@SamPenrose
Created September 4, 2015 23:47
Show Gist options
  • Save SamPenrose/584e9f9e65022b7fa847 to your computer and use it in GitHub Desktop.
Save SamPenrose/584e9f9e65022b7fa847 to your computer and use it in GitHub Desktop.
Display the source blob
Display the rendered blob
Raw
{"nbformat_minor": 0, "cells": [{"source": "This notebook records an experiment with using Accumulators, rather than\nexplicitly returned values, to tally FHR statistics.\n\nThe notebook does NOT establish that the approach is a good idea, or even\nthat it performs better.\n\nThe code is WIP for a v2/v4 comparison focused on 41 beta.", "cell_type": "markdown", "metadata": {}}, {"execution_count": 1, "cell_type": "code", "source": "# NOTE(review): sc is presumably the SparkContext injected by the notebook\n# environment -- confirm before running elsewhere.\nv2_path = 's3n://mozillametricsfhrsamples/beta/part-r-00001'\nv2_file = sc.sequenceFile(v2_path)", "outputs": [], "metadata": {"collapsed": true, "trusted": true}}, {"execution_count": 12, "cell_type": "code", "source": "import ujson as json\nfrom numbers import Number\n\n# Crash categories in a v2 day-blob that are tallied as crashes.\nV2_CRASH_KEYS = set(['main-crash', 'main-hang', 'plugin-crash', 'plugin-hang'])\n# Inclusive date window; assumes day keys are yyyy-mm-dd strings, which\n# compare chronologically.\nWINDOW_START = '2015-08-08'\nWINDOW_END = '2015-08-30'\nAN_HOUR = 3600.0  # seconds per hour\nCHANNEL = 'beta'\nVERSION = '41.0'\n\n\ndef make_walker():\n    '''\n    Build the per-record callback and the accumulators it updates.\n\n    Returns a (walk, counters) tuple. walk takes one (key, json_string)\n    pair from the sequenceFile and returns None; counters maps each\n    statistic name to the accumulator that walk adds to.\n    '''\n    counters = {\n        # Pings on CHANNEL/VERSION with at least one day in the window.\n        'beta_41': sc.accumulator(0),\n        'crashes': sc.accumulator(0),\n        'yahoo': sc.accumulator(0),\n        'hours': sc.accumulator(0.0),\n        # Incremented once per in-window day, not once per ping.\n        'default': sc.accumulator(0),\n        # Pings whose processing raised any exception.\n        'parsing_errors': sc.accumulator(0)\n    }\n\n    def descend(d):\n        '''\n        Inspect a $data$days blob and add its tallies to the counters.\n        '''\n        searches = d.get('org.mozilla.searches.counts', {})\n        for k in searches:\n            if 'yahoo' in k.lower():\n                value = searches[k]\n                # Only numeric values are tallied.\n                if isinstance(value, Number):\n                    counters['yahoo'].add(value)\n        crash_dict = d.get('org.mozilla.crashes.crashes', {})\n        crashes = sum(v for k, v in crash_dict.items() if k in V2_CRASH_KEYS)\n        if crashes:\n            counters['crashes'].add(crashes)\n        if d.get('org.mozilla.appInfo.appinfo', {}).get('isDefaultBrowser', 0):\n            counters['default'].add(1)\n        seconds = sum(d.get('org.mozilla.appSessions.previous', {}).get(\n            'cleanTotalTime', [0]))\n        if seconds > 0:\n            counters['hours'].add(seconds / AN_HOUR)\n\n    def walk(tup):\n        '''\n        Process a single (key, json_string) value from the v2 sequenceFile.\n\n        Always returns None; the results are the accumulator updates.\n        '''\n        try:\n            cid, ping = tup\n            d = json.loads(ping)\n            # I suspect I should be checking each day-blob.\n            info = d.get('geckoAppInfo')\n            if not info:\n                return\n            c = info.get('updateChannel')\n            v = info.get('version')\n            if (c, v) != (CHANNEL, VERSION):\n                return\n            days = d.get('data', {}).get('days', {})\n            in_window = [blob for date, blob in days.items()\n                         if WINDOW_START <= date <= WINDOW_END]\n            if not in_window:\n                return\n            counters['beta_41'].add(1)\n            for day in in_window:\n                descend(day)\n        except Exception:\n            # Best effort: count the failure and move on. Tallies already\n            # added for this ping before the failure are not rolled back.\n            counters['parsing_errors'].add(1)\n    return walk, counters", "outputs": [], "metadata": {"collapsed": true, "trusted": true}}, {"execution_count": 13, "cell_type": "code", "source": "walk, counters = make_walker()", "outputs": [], "metadata": {"collapsed": true, "trusted": true}}, {"execution_count": 14, "cell_type": "code", "source": "# foreach is an action: Spark applies each task's accumulator updates\n# exactly once, whereas updates made inside a transformation such as map\n# may be re-applied if a task is re-executed. It also avoids collecting a\n# list of None values on the driver.\nv2_file.foreach(walk)\nfor name, obj in counters.items():\n    print('%s %s' % (name, obj.value))", "outputs": [{"output_type": "stream", "name": "stdout", "text": "crashes 11052\ndefault 79038\nyahoo 9901\nhours 683967.951667\nparsing_errors 0\nbeta_41 12736\n"}], "metadata": {"collapsed": false, "trusted": true}}], "nbformat": 4, "metadata": {"kernelspec": {"display_name": "Python 2", "name": "python2", "language": "python"}, "language_info": {"mimetype": "text/x-python", "nbconvert_exporter": "python", "version": "2.7.9", "name": "python", "file_extension": ".py", "pygments_lexer": "ipython2", "codemirror_mode": {"version": 2, "name": "ipython"}}}}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment