Skip to content

Instantly share code, notes, and snippets.

Show Gist options
  • Save SamPenrose/9996eb52a2bd5160a6a8 to your computer and use it in GitHub Desktop.
Save SamPenrose/9996eb52a2bd5160a6a8 to your computer and use it in GitHub Desktop.
Display the source blob
Display the rendered blob
Raw
{"nbformat_minor": 0, "cells": [
{"execution_count": 2, "cell_type": "code", "source": "import urllib\nf = urllib.urlopen('https://raw.githubusercontent.com/SamPenrose/data-pipeline/01ba20acb5242a6e65ddcaad94e154d8f7c67cf7/schemas/test/sample_v4_ping.json')\nping = f.read()\nf.close()", "outputs": [], "metadata": {"collapsed": false, "trusted": true}},
{"execution_count": 4, "cell_type": "code", "source": "import ujson as json\nping = json.loads(ping)", "outputs": [], "metadata": {"collapsed": false, "trusted": true}},
{"execution_count": 5, "cell_type": "code", "source": "ping.keys()", "outputs": [{"execution_count": 5, "output_type": "execute_result", "data": {"text/plain": "[u'clientId',\n u'payload',\n u'environment',\n u'application',\n u'version',\n u'creationDate',\n u'type',\n u'id']"}, "metadata": {}}], "metadata": {"collapsed": false, "trusted": true}},
{"execution_count": 35, "cell_type": "code", "source": "# dissect_ping detaches each listed piece from its parent, so a child path\n# (e.g. 'payload/histograms') has to be handled before its parent ('payload').\n# dissect_ping orders the paths itself; the order of this list does not matter.\npaths = [\n    'application',\n    'environment',\n    'environment/addons',\n    'payload',\n    'payload/addonDetails',\n    'payload/childPayloads',\n    'payload/histograms',\n    'payload/keyedHistograms',\n    'payload/slowSQL'\n    ]\npaths.sort()\n\ndef dissect_ping(d):\n    \"\"\"Split the ping dict d into one piece per entry of paths.\n\n    Returns a dict mapping each path to the value found there ({} when the\n    path is missing or empty) plus 'root', which is d itself. d is modified\n    in place: every non-empty piece is removed from its parent, so 'root'\n    ends up holding only the keys that no path claimed.\n    \"\"\"\n    result = {'root': d}\n    # Deepest paths first: once a parent has been detached from d, its\n    # children can no longer be reached from the root.\n    for path in sorted(paths, key=lambda p: -p.count('/')):\n        names = path.split('/')\n        parent = d\n        for name in names[:-1]:\n            parent = parent.get(name, {})\n        leaf = names[-1]\n        target = parent.get(leaf, {})\n        if target:\n            del parent[leaf]\n        result[path] = target\n    return result", "outputs": [], "metadata": {"collapsed": false, "trusted": true}},
{"execution_count": 36, "cell_type": "code", "source": "parts = dissect_ping(ping)\nparts.keys()", "outputs": [{"execution_count": 36, "output_type": "execute_result", "data": {"text/plain": "['payload/histograms',\n 'payload/childPayloads',\n 'payload',\n 'environment',\n 'application',\n 'payload/slowSQL',\n 'payload/addonDetails',\n 'payload/keyedHistograms',\n 'root',\n 'environment/addons']"}, "metadata": {}}], "metadata": {"collapsed": false, "trusted": true}},
{"execution_count": 37, "cell_type": "code", "source": "parts['root']", "outputs": [{"execution_count": 37, "output_type": "execute_result", "data": {"text/plain": "{u'clientId': u'6fd3eb50-8bec-4b9c-8778-59406171312a',\n u'creationDate': u'2015-11-05T01:25:43.312Z',\n u'id': u'0fdac909-d2ec-454c-b625-261a3e5d5c9b',\n u'type': u'main',\n u'version': 4}"}, "metadata": {}}], "metadata": {"collapsed": false, "trusted": true}},
{"execution_count": 38, "cell_type": "code", "source": "sqlContext.jsonRDD(sc.parallelize([parts['root']]).map(lambda d: json.dumps(d))).printSchema()", "outputs": [{"output_type": "stream", "name": "stdout", "text": "root\n |-- clientId: string (nullable = true)\n |-- creationDate: string (nullable = true)\n |-- id: string (nullable = true)\n |-- type: string (nullable = true)\n |-- version: long (nullable = true)\n\n"}], "metadata": {"collapsed": false, "trusted": true}},
{"execution_count": 39, "cell_type": "code", "source": "parts['application']", "outputs": [{"execution_count": 39, "output_type": "execute_result", "data": {"text/plain": "{u'architecture': u'x86-64',\n u'buildId': u'20151103030248',\n u'channel': u'nightly',\n u'name': u'Firefox',\n u'platformVersion': u'45.0a1',\n u'vendor': u'Mozilla',\n u'version': u'45.0a1',\n u'xpcomAbi': u'x86_64-gcc3'}"}, "metadata": {}}], "metadata": {"collapsed": false, "trusted": true}},
{"execution_count": 40, "cell_type": "code", "source": "sqlContext.jsonRDD(sc.parallelize([parts['application']]).map(lambda d: json.dumps(d))).printSchema()", "outputs": [{"output_type": "stream", "name": "stdout", "text": "root\n |-- architecture: string (nullable = true)\n |-- buildId: string (nullable = true)\n |-- channel: string (nullable = true)\n |-- name: string (nullable = true)\n |-- platformVersion: string (nullable = true)\n |-- vendor: string (nullable = true)\n |-- version: string (nullable = true)\n |-- xpcomAbi: string (nullable = true)\n\n"}], "metadata": {"collapsed": false, "trusted": true}},
{"execution_count": 41, "cell_type": "code", "source": "# NOTE(review): this schema is inferred from parts['root'], not parts['environment'] -- confirm which one was intended.\nenvironment_schema = sqlContext.jsonRDD(sc.parallelize([parts['root']]).map(lambda d: json.dumps(d))).schema", "outputs": [], "metadata": {"collapsed": true, "trusted": true}},
{"execution_count": 42, "cell_type": "code", "source": "environment_schema", "outputs": [{"execution_count": 42, "output_type": "execute_result", "data": {"text/plain": "StructType(List(StructField(clientId,StringType,true),StructField(creationDate,StringType,true),StructField(id,StringType,true),StructField(type,StringType,true),StructField(version,LongType,true)))"}, "metadata": {}}], "metadata": {"collapsed": false, "trusted": true}},
{"execution_count": null, "cell_type": "code", "source": "", "outputs": [], "metadata": {"collapsed": true, "trusted": true}}
], "nbformat": 4, "metadata": {"kernelspec": {"display_name": "Python 2", "name": "python2", "language": "python"}, "language_info": {"mimetype": "text/x-python", "nbconvert_exporter": "python", "version": "2.7.9", "name": "python", "file_extension": ".py", "pygments_lexer": "ipython2", "codemirror_mode": {"version": 2, "name": "ipython"}}}}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment