{ "cells": [ { "cell_type": "markdown", "metadata": {}, "source": [ "## This notebook is part of the Apache Spark training delivered by CERN-IT\n", "### Spark SQL Hands-On Lab with Solutions\n", "Contact: Luca.Canali@cern.ch" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "Run this notebook from Jupyter with Python kernel\n", "- When using on CERN SWAN, do not attach the notebook to a Spark cluster, but rather run locally on the SWAN container\n", "- If running this outside CERN SWAN, please make sure to have PySpark installed: `pip install pyspark`" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## Example datasets\n", "The following examples use sample data provided in the repository. \n", "We will use the movielens dataset from Kaggle, credits: https://www.kaggle.com/grouplens/movielens-20m-dataset" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "# Create Spark Session, you need this to work with Spark\n", "from pyspark.sql import SparkSession\n", "spark = SparkSession.builder \\\n", " .appName(\"My spark example app\") \\\n", " .master(\"local[*]\") \\\n", " .config(\"spark.driver.memory\",\"8g\") \\\n", " .config(\"spark.ui.showConsoleProgress\", \"false\") \\\n", " .getOrCreate()" ] }, { "cell_type": "code", "execution_count": 2, "metadata": {}, "outputs": [ { "data": { "text/html": [ "\n", "
\n", "

SparkSession - in-memory

\n", " \n", "
\n", "

SparkContext

\n", "\n", "

Spark UI

\n", "\n", "
\n", "
Version
\n", "
v3.3.1
\n", "
Master
\n", "
local[*]
\n", "
AppName
\n", "
My spark example app
\n", "
\n", "
\n", " \n", "
\n", " " ], "text/plain": [ "" ] }, "execution_count": 2, "metadata": {}, "output_type": "execute_result" } ], "source": [ "spark" ] }, { "cell_type": "code", "execution_count": 3, "metadata": {}, "outputs": [], "source": [ "# sets the path to the directory with datafiles\n", "PATH = \"../data/\"" ] }, { "cell_type": "code", "execution_count": 4, "metadata": { "scrolled": false }, "outputs": [], "source": [ "ratings = spark.read.option(\"header\",\"true\").csv(PATH + \"ratings1.csv.gz\")" ] }, { "cell_type": "code", "execution_count": 5, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "+------+-------+------+----------+\n", "|userId|movieId|rating| timestamp|\n", "+------+-------+------+----------+\n", "| 1| 110| 1.0|1425941529|\n", "| 1| 147| 4.5|1425942435|\n", "| 1| 858| 5.0|1425941523|\n", "| 1| 1221| 5.0|1425941546|\n", "| 1| 1246| 5.0|1425941556|\n", "+------+-------+------+----------+\n", "only showing top 5 rows\n", "\n" ] } ], "source": [ "ratings.show(5)" ] }, { "cell_type": "code", "execution_count": 6, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "root\n", " |-- userId: string (nullable = true)\n", " |-- movieId: string (nullable = true)\n", " |-- rating: string (nullable = true)\n", " |-- timestamp: string (nullable = true)\n", "\n" ] } ], "source": [ "ratings.printSchema()" ] }, { "cell_type": "code", "execution_count": 7, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "CPU times: user 7.62 ms, sys: 6.18 ms, total: 13.8 ms\n", "Wall time: 26.9 s\n" ] } ], "source": [ "# infer schema needs to go through the data to estimate the schema, this takes time\n", "\n", "%time ratings = spark.read.option(\"header\",\"true\").option(\"inferSchema\", \"true\").csv(PATH + \"ratings1.csv.gz\")\n", "\n", "# note ratings*.csv.gz will read ratings1.csv.gz and ratings2.csv.gz, more data, however slower to run\n", "# 
spark.read.option(\"header\",\"true\").option(\"inferSchema\", \"true\").csv(PATH + \"ratings*.csv.gz\")" ] }, { "cell_type": "code", "execution_count": 8, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "root\n", " |-- userId: integer (nullable = true)\n", " |-- movieId: integer (nullable = true)\n", " |-- rating: double (nullable = true)\n", " |-- timestamp: integer (nullable = true)\n", "\n" ] } ], "source": [ "ratings.printSchema()" ] }, { "cell_type": "code", "execution_count": 9, "metadata": {}, "outputs": [], "source": [ "# movielens dataset\n", "movies = spark.read.option(\"header\",\"true\").option(\"inferSchema\", \"true\").csv(PATH + \"movies.csv.gz\")" ] }, { "cell_type": "code", "execution_count": 10, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "+-------+----------------------------------+-------------------------------------------+\n", "|movieId|title |genres |\n", "+-------+----------------------------------+-------------------------------------------+\n", "|1 |Toy Story (1995) |Adventure|Animation|Children|Comedy|Fantasy|\n", "|2 |Jumanji (1995) |Adventure|Children|Fantasy |\n", "|3 |Grumpier Old Men (1995) |Comedy|Romance |\n", "|4 |Waiting to Exhale (1995) |Comedy|Drama|Romance |\n", "|5 |Father of the Bride Part II (1995)|Comedy |\n", "+-------+----------------------------------+-------------------------------------------+\n", "only showing top 5 rows\n", "\n" ] } ], "source": [ "movies.show(5, False)" ] }, { "cell_type": "code", "execution_count": 11, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "root\n", " |-- movieId: integer (nullable = true)\n", " |-- title: string (nullable = true)\n", " |-- genres: string (nullable = true)\n", "\n" ] } ], "source": [ "movies.printSchema()" ] }, { "cell_type": "code", "execution_count": 12, "metadata": {}, "outputs": [], "source": [ "tags = 
spark.read.option(\"header\",\"true\").option(\"inferSchema\", \"true\").csv(PATH + \"tags.csv.gz\")" ] }, { "cell_type": "code", "execution_count": 13, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "+------+-------+----------+----------+\n", "|userId|movieId| tag| timestamp|\n", "+------+-------+----------+----------+\n", "| 1| 318| narrated|1425942391|\n", "| 20| 4306|Dreamworks|1459855607|\n", "| 20| 89302| England|1400778834|\n", "| 20| 89302| espionage|1400778836|\n", "| 20| 89302| jazz|1400778841|\n", "+------+-------+----------+----------+\n", "only showing top 5 rows\n", "\n" ] } ], "source": [ "tags.show(5)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "#### Register the dataframes as Spark Temporary Views" ] }, { "cell_type": "code", "execution_count": 14, "metadata": {}, "outputs": [], "source": [ "ratings.createOrReplaceTempView(\"ratings\")\n", "movies.createOrReplaceTempView(\"movies\")\n", "tags.createOrReplaceTempView(\"tags\")" ] }, { "cell_type": "code", "execution_count": 15, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "== Physical Plan ==\n", "*(1) Filter (isnotnull(movieId#85) AND (movieId#85 = 1))\n", "+- FileScan csv [movieId#85,title#86] Batched: false, DataFilters: [isnotnull(movieId#85), (movieId#85 = 1)], Format: CSV, Location: InMemoryFileIndex(1 paths)[file:/eos/home-c/canali/SWAN_projects/HadoopSparkTraining/Spark/data/m..., PartitionFilters: [], PushedFilters: [IsNotNull(movieId), EqualTo(movieId,1)], ReadSchema: struct\n", "\n", "\n" ] } ], "source": [ "# note what happens when we query a table in a csv file with a filter\n", "spark.sql(\"select movieId, title from movies where movieId=1\").explain()" ] }, { "cell_type": "code", "execution_count": 16, "metadata": {}, "outputs": [], "source": [ "# cache the tables, to improve the performance of the rest of the queries in the notebook\n", "# note: default caching level is MEMORY_AND_DISK (i.e. 
caching in memory if enough heap is available)\n", "# note: caching is lazily executed, so a count() action is added to make the operation happen\n", "# this operation may take a couple of minutes\n", "\n", "r = ratings.cache().count()\n", "m = movies.cache().count()\n", "t = tags.cache().count()" ] }, { "cell_type": "code", "execution_count": 17, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Num ratings = 13012144\n", "Num tags = 753170\n", "Num movies = 45843\n" ] } ], "source": [ "print(f\"Num ratings = {r}\\nNum tags = {t}\\nNum movies = {m}\")" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### SQL Queries on the loaded tables\n", "#### 1) How many movies produced per year?" ] }, { "cell_type": "code", "execution_count": 18, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "+-------+----------------------------------+-------------------------------------------+----+\n", "|movieId|title |genres |Year|\n", "+-------+----------------------------------+-------------------------------------------+----+\n", "|1 |Toy Story (1995) |Adventure|Animation|Children|Comedy|Fantasy|1995|\n", "|2 |Jumanji (1995) |Adventure|Children|Fantasy |1995|\n", "|3 |Grumpier Old Men (1995) |Comedy|Romance |1995|\n", "|4 |Waiting to Exhale (1995) |Comedy|Drama|Romance |1995|\n", "|5 |Father of the Bride Part II (1995)|Comedy |1995|\n", "+-------+----------------------------------+-------------------------------------------+----+\n", "only showing top 5 rows\n", "\n" ] } ], "source": [ "# Add the column Year to \"movies\"\n", "movies_year = spark.sql(\"select *, regexp_extract(title,'^(.*) \\\\\\\\(([0-9 \\\\\\\\-]*)\\\\\\\\)$',2) as Year from movies\")\n", "movies_year.show(5,False)\n", "movies_year.createOrReplaceTempView(\"movies_year\")\n", "\n", "# This is the DataFrame API equivalent, not that \\\\ oddly need to be changed to \\\\\\\\\\ when using SQL, at least in this version of Spark\n", "# 
from pyspark.sql.functions import regexp_extract\n", "# movies_year = movies.withColumn(\"Year\",regexp_extract(\"title\",'^(.*) \\\\(([0-9 \\\\-]*)\\\\)$',2))" ] }, { "cell_type": "code", "execution_count": 19, "metadata": {}, "outputs": [], "source": [ "# number of movies per year\n", "m_yr = spark.sql(\"select year, count(1) as count from movies_year group by year order by year\").toPandas()" ] }, { "cell_type": "code", "execution_count": 20, "metadata": {}, "outputs": [], "source": [ "%matplotlib notebook\n", "import matplotlib.pyplot as plt\n", "import pandas as pd\n", "plt.style.use('seaborn-darkgrid')" ] }, { "cell_type": "code", "execution_count": 21, "metadata": {}, "outputs": [ { "data": { "application/javascript": [ "/* Put everything inside the global mpl namespace */\n", "/* global mpl */\n", "window.mpl = {};\n", "\n", "mpl.get_websocket_type = function () {\n", " if (typeof WebSocket !== 'undefined') {\n", " return WebSocket;\n", " } else if (typeof MozWebSocket !== 'undefined') {\n", " return MozWebSocket;\n", " } else {\n", " alert(\n", " 'Your browser does not have WebSocket support. ' +\n", " 'Please try Chrome, Safari or Firefox ≥ 6. ' +\n", " 'Firefox 4 and 5 are also supported but you ' +\n", " 'have to enable WebSockets in about:config.'\n", " );\n", " }\n", "};\n", "\n", "mpl.figure = function (figure_id, websocket, ondownload, parent_element) {\n", " this.id = figure_id;\n", "\n", " this.ws = websocket;\n", "\n", " this.supports_binary = this.ws.binaryType !== undefined;\n", "\n", " if (!this.supports_binary) {\n", " var warnings = document.getElementById('mpl-warnings');\n", " if (warnings) {\n", " warnings.style.display = 'block';\n", " warnings.textContent =\n", " 'This browser does not support binary websocket messages. 
' +\n", " 'Performance may be slow.';\n", " }\n", " }\n", "\n", " this.imageObj = new Image();\n", "\n", " this.context = undefined;\n", " this.message = undefined;\n", " this.canvas = undefined;\n", " this.rubberband_canvas = undefined;\n", " this.rubberband_context = undefined;\n", " this.format_dropdown = undefined;\n", "\n", " this.image_mode = 'full';\n", "\n", " this.root = document.createElement('div');\n", " this.root.setAttribute('style', 'display: inline-block');\n", " this._root_extra_style(this.root);\n", "\n", " parent_element.appendChild(this.root);\n", "\n", " this._init_header(this);\n", " this._init_canvas(this);\n", " this._init_toolbar(this);\n", "\n", " var fig = this;\n", "\n", " this.waiting = false;\n", "\n", " this.ws.onopen = function () {\n", " fig.send_message('supports_binary', { value: fig.supports_binary });\n", " fig.send_message('send_image_mode', {});\n", " if (fig.ratio !== 1) {\n", " fig.send_message('set_dpi_ratio', { dpi_ratio: fig.ratio });\n", " }\n", " fig.send_message('refresh', {});\n", " };\n", "\n", " this.imageObj.onload = function () {\n", " if (fig.image_mode === 'full') {\n", " // Full images could contain transparency (where diff images\n", " // almost always do), so we need to clear the canvas so that\n", " // there is no ghosting.\n", " fig.context.clearRect(0, 0, fig.canvas.width, fig.canvas.height);\n", " }\n", " fig.context.drawImage(fig.imageObj, 0, 0);\n", " };\n", "\n", " this.imageObj.onunload = function () {\n", " fig.ws.close();\n", " };\n", "\n", " this.ws.onmessage = this._make_on_message_function(this);\n", "\n", " this.ondownload = ondownload;\n", "};\n", "\n", "mpl.figure.prototype._init_header = function () {\n", " var titlebar = document.createElement('div');\n", " titlebar.classList =\n", " 'ui-dialog-titlebar ui-widget-header ui-corner-all ui-helper-clearfix';\n", " var titletext = document.createElement('div');\n", " titletext.classList = 'ui-dialog-title';\n", " titletext.setAttribute(\n", " 
'style',\n", " 'width: 100%; text-align: center; padding: 3px;'\n", " );\n", " titlebar.appendChild(titletext);\n", " this.root.appendChild(titlebar);\n", " this.header = titletext;\n", "};\n", "\n", "mpl.figure.prototype._canvas_extra_style = function (_canvas_div) {};\n", "\n", "mpl.figure.prototype._root_extra_style = function (_canvas_div) {};\n", "\n", "mpl.figure.prototype._init_canvas = function () {\n", " var fig = this;\n", "\n", " var canvas_div = (this.canvas_div = document.createElement('div'));\n", " canvas_div.setAttribute(\n", " 'style',\n", " 'border: 1px solid #ddd;' +\n", " 'box-sizing: content-box;' +\n", " 'clear: both;' +\n", " 'min-height: 1px;' +\n", " 'min-width: 1px;' +\n", " 'outline: 0;' +\n", " 'overflow: hidden;' +\n", " 'position: relative;' +\n", " 'resize: both;'\n", " );\n", "\n", " function on_keyboard_event_closure(name) {\n", " return function (event) {\n", " return fig.key_event(event, name);\n", " };\n", " }\n", "\n", " canvas_div.addEventListener(\n", " 'keydown',\n", " on_keyboard_event_closure('key_press')\n", " );\n", " canvas_div.addEventListener(\n", " 'keyup',\n", " on_keyboard_event_closure('key_release')\n", " );\n", "\n", " this._canvas_extra_style(canvas_div);\n", " this.root.appendChild(canvas_div);\n", "\n", " var canvas = (this.canvas = document.createElement('canvas'));\n", " canvas.classList.add('mpl-canvas');\n", " canvas.setAttribute('style', 'box-sizing: content-box;');\n", "\n", " this.context = canvas.getContext('2d');\n", "\n", " var backingStore =\n", " this.context.backingStorePixelRatio ||\n", " this.context.webkitBackingStorePixelRatio ||\n", " this.context.mozBackingStorePixelRatio ||\n", " this.context.msBackingStorePixelRatio ||\n", " this.context.oBackingStorePixelRatio ||\n", " this.context.backingStorePixelRatio ||\n", " 1;\n", "\n", " this.ratio = (window.devicePixelRatio || 1) / backingStore;\n", "\n", " var rubberband_canvas = (this.rubberband_canvas = document.createElement(\n", " 
'canvas'\n", " ));\n", " rubberband_canvas.setAttribute(\n", " 'style',\n", " 'box-sizing: content-box; position: absolute; left: 0; top: 0; z-index: 1;'\n", " );\n", "\n", " // Apply a ponyfill if ResizeObserver is not implemented by browser.\n", " if (this.ResizeObserver === undefined) {\n", " if (window.ResizeObserver !== undefined) {\n", " this.ResizeObserver = window.ResizeObserver;\n", " } else {\n", " var obs = _JSXTOOLS_RESIZE_OBSERVER({});\n", " this.ResizeObserver = obs.ResizeObserver;\n", " }\n", " }\n", "\n", " this.resizeObserverInstance = new this.ResizeObserver(function (entries) {\n", " var nentries = entries.length;\n", " for (var i = 0; i < nentries; i++) {\n", " var entry = entries[i];\n", " var width, height;\n", " if (entry.contentBoxSize) {\n", " if (entry.contentBoxSize instanceof Array) {\n", " // Chrome 84 implements new version of spec.\n", " width = entry.contentBoxSize[0].inlineSize;\n", " height = entry.contentBoxSize[0].blockSize;\n", " } else {\n", " // Firefox implements old version of spec.\n", " width = entry.contentBoxSize.inlineSize;\n", " height = entry.contentBoxSize.blockSize;\n", " }\n", " } else {\n", " // Chrome <84 implements even older version of spec.\n", " width = entry.contentRect.width;\n", " height = entry.contentRect.height;\n", " }\n", "\n", " // Keep the size of the canvas and rubber band canvas in sync with\n", " // the canvas container.\n", " if (entry.devicePixelContentBoxSize) {\n", " // Chrome 84 implements new version of spec.\n", " canvas.setAttribute(\n", " 'width',\n", " entry.devicePixelContentBoxSize[0].inlineSize\n", " );\n", " canvas.setAttribute(\n", " 'height',\n", " entry.devicePixelContentBoxSize[0].blockSize\n", " );\n", " } else {\n", " canvas.setAttribute('width', width * fig.ratio);\n", " canvas.setAttribute('height', height * fig.ratio);\n", " }\n", " canvas.setAttribute(\n", " 'style',\n", " 'width: ' + width + 'px; height: ' + height + 'px;'\n", " );\n", "\n", " 
rubberband_canvas.setAttribute('width', width);\n", " rubberband_canvas.setAttribute('height', height);\n", "\n", " // And update the size in Python. We ignore the initial 0/0 size\n", " // that occurs as the element is placed into the DOM, which should\n", " // otherwise not happen due to the minimum size styling.\n", " if (fig.ws.readyState == 1 && width != 0 && height != 0) {\n", " fig.request_resize(width, height);\n", " }\n", " }\n", " });\n", " this.resizeObserverInstance.observe(canvas_div);\n", "\n", " function on_mouse_event_closure(name) {\n", " return function (event) {\n", " return fig.mouse_event(event, name);\n", " };\n", " }\n", "\n", " rubberband_canvas.addEventListener(\n", " 'mousedown',\n", " on_mouse_event_closure('button_press')\n", " );\n", " rubberband_canvas.addEventListener(\n", " 'mouseup',\n", " on_mouse_event_closure('button_release')\n", " );\n", " rubberband_canvas.addEventListener(\n", " 'dblclick',\n", " on_mouse_event_closure('dblclick')\n", " );\n", " // Throttle sequential mouse events to 1 every 20ms.\n", " rubberband_canvas.addEventListener(\n", " 'mousemove',\n", " on_mouse_event_closure('motion_notify')\n", " );\n", "\n", " rubberband_canvas.addEventListener(\n", " 'mouseenter',\n", " on_mouse_event_closure('figure_enter')\n", " );\n", " rubberband_canvas.addEventListener(\n", " 'mouseleave',\n", " on_mouse_event_closure('figure_leave')\n", " );\n", "\n", " canvas_div.addEventListener('wheel', function (event) {\n", " if (event.deltaY < 0) {\n", " event.step = 1;\n", " } else {\n", " event.step = -1;\n", " }\n", " on_mouse_event_closure('scroll')(event);\n", " });\n", "\n", " canvas_div.appendChild(canvas);\n", " canvas_div.appendChild(rubberband_canvas);\n", "\n", " this.rubberband_context = rubberband_canvas.getContext('2d');\n", " this.rubberband_context.strokeStyle = '#000000';\n", "\n", " this._resize_canvas = function (width, height, forward) {\n", " if (forward) {\n", " canvas_div.style.width = width + 'px';\n", " 
canvas_div.style.height = height + 'px';\n", " }\n", " };\n", "\n", " // Disable right mouse context menu.\n", " this.rubberband_canvas.addEventListener('contextmenu', function (_e) {\n", " event.preventDefault();\n", " return false;\n", " });\n", "\n", " function set_focus() {\n", " canvas.focus();\n", " canvas_div.focus();\n", " }\n", "\n", " window.setTimeout(set_focus, 100);\n", "};\n", "\n", "mpl.figure.prototype._init_toolbar = function () {\n", " var fig = this;\n", "\n", " var toolbar = document.createElement('div');\n", " toolbar.classList = 'mpl-toolbar';\n", " this.root.appendChild(toolbar);\n", "\n", " function on_click_closure(name) {\n", " return function (_event) {\n", " return fig.toolbar_button_onclick(name);\n", " };\n", " }\n", "\n", " function on_mouseover_closure(tooltip) {\n", " return function (event) {\n", " if (!event.currentTarget.disabled) {\n", " return fig.toolbar_button_onmouseover(tooltip);\n", " }\n", " };\n", " }\n", "\n", " fig.buttons = {};\n", " var buttonGroup = document.createElement('div');\n", " buttonGroup.classList = 'mpl-button-group';\n", " for (var toolbar_ind in mpl.toolbar_items) {\n", " var name = mpl.toolbar_items[toolbar_ind][0];\n", " var tooltip = mpl.toolbar_items[toolbar_ind][1];\n", " var image = mpl.toolbar_items[toolbar_ind][2];\n", " var method_name = mpl.toolbar_items[toolbar_ind][3];\n", "\n", " if (!name) {\n", " /* Instead of a spacer, we start a new button group. 
*/\n", " if (buttonGroup.hasChildNodes()) {\n", " toolbar.appendChild(buttonGroup);\n", " }\n", " buttonGroup = document.createElement('div');\n", " buttonGroup.classList = 'mpl-button-group';\n", " continue;\n", " }\n", "\n", " var button = (fig.buttons[name] = document.createElement('button'));\n", " button.classList = 'mpl-widget';\n", " button.setAttribute('role', 'button');\n", " button.setAttribute('aria-disabled', 'false');\n", " button.addEventListener('click', on_click_closure(method_name));\n", " button.addEventListener('mouseover', on_mouseover_closure(tooltip));\n", "\n", " var icon_img = document.createElement('img');\n", " icon_img.src = '_images/' + image + '.png';\n", " icon_img.srcset = '_images/' + image + '_large.png 2x';\n", " icon_img.alt = tooltip;\n", " button.appendChild(icon_img);\n", "\n", " buttonGroup.appendChild(button);\n", " }\n", "\n", " if (buttonGroup.hasChildNodes()) {\n", " toolbar.appendChild(buttonGroup);\n", " }\n", "\n", " var fmt_picker = document.createElement('select');\n", " fmt_picker.classList = 'mpl-widget';\n", " toolbar.appendChild(fmt_picker);\n", " this.format_dropdown = fmt_picker;\n", "\n", " for (var ind in mpl.extensions) {\n", " var fmt = mpl.extensions[ind];\n", " var option = document.createElement('option');\n", " option.selected = fmt === mpl.default_extension;\n", " option.innerHTML = fmt;\n", " fmt_picker.appendChild(option);\n", " }\n", "\n", " var status_bar = document.createElement('span');\n", " status_bar.classList = 'mpl-message';\n", " toolbar.appendChild(status_bar);\n", " this.message = status_bar;\n", "};\n", "\n", "mpl.figure.prototype.request_resize = function (x_pixels, y_pixels) {\n", " // Request matplotlib to resize the figure. 
Matplotlib will then trigger a resize in the client,\n", " // which will in turn request a refresh of the image.\n", " this.send_message('resize', { width: x_pixels, height: y_pixels });\n", "};\n", "\n", "mpl.figure.prototype.send_message = function (type, properties) {\n", " properties['type'] = type;\n", " properties['figure_id'] = this.id;\n", " this.ws.send(JSON.stringify(properties));\n", "};\n", "\n", "mpl.figure.prototype.send_draw_message = function () {\n", " if (!this.waiting) {\n", " this.waiting = true;\n", " this.ws.send(JSON.stringify({ type: 'draw', figure_id: this.id }));\n", " }\n", "};\n", "\n", "mpl.figure.prototype.handle_save = function (fig, _msg) {\n", " var format_dropdown = fig.format_dropdown;\n", " var format = format_dropdown.options[format_dropdown.selectedIndex].value;\n", " fig.ondownload(fig, format);\n", "};\n", "\n", "mpl.figure.prototype.handle_resize = function (fig, msg) {\n", " var size = msg['size'];\n", " if (size[0] !== fig.canvas.width || size[1] !== fig.canvas.height) {\n", " fig._resize_canvas(size[0], size[1], msg['forward']);\n", " fig.send_message('refresh', {});\n", " }\n", "};\n", "\n", "mpl.figure.prototype.handle_rubberband = function (fig, msg) {\n", " var x0 = msg['x0'] / fig.ratio;\n", " var y0 = (fig.canvas.height - msg['y0']) / fig.ratio;\n", " var x1 = msg['x1'] / fig.ratio;\n", " var y1 = (fig.canvas.height - msg['y1']) / fig.ratio;\n", " x0 = Math.floor(x0) + 0.5;\n", " y0 = Math.floor(y0) + 0.5;\n", " x1 = Math.floor(x1) + 0.5;\n", " y1 = Math.floor(y1) + 0.5;\n", " var min_x = Math.min(x0, x1);\n", " var min_y = Math.min(y0, y1);\n", " var width = Math.abs(x1 - x0);\n", " var height = Math.abs(y1 - y0);\n", "\n", " fig.rubberband_context.clearRect(\n", " 0,\n", " 0,\n", " fig.canvas.width / fig.ratio,\n", " fig.canvas.height / fig.ratio\n", " );\n", "\n", " fig.rubberband_context.strokeRect(min_x, min_y, width, height);\n", "};\n", "\n", "mpl.figure.prototype.handle_figure_label = function (fig, msg) 
{\n", " // Updates the figure title.\n", " fig.header.textContent = msg['label'];\n", "};\n", "\n", "mpl.figure.prototype.handle_cursor = function (fig, msg) {\n", " var cursor = msg['cursor'];\n", " switch (cursor) {\n", " case 0:\n", " cursor = 'pointer';\n", " break;\n", " case 1:\n", " cursor = 'default';\n", " break;\n", " case 2:\n", " cursor = 'crosshair';\n", " break;\n", " case 3:\n", " cursor = 'move';\n", " break;\n", " }\n", " fig.rubberband_canvas.style.cursor = cursor;\n", "};\n", "\n", "mpl.figure.prototype.handle_message = function (fig, msg) {\n", " fig.message.textContent = msg['message'];\n", "};\n", "\n", "mpl.figure.prototype.handle_draw = function (fig, _msg) {\n", " // Request the server to send over a new figure.\n", " fig.send_draw_message();\n", "};\n", "\n", "mpl.figure.prototype.handle_image_mode = function (fig, msg) {\n", " fig.image_mode = msg['mode'];\n", "};\n", "\n", "mpl.figure.prototype.handle_history_buttons = function (fig, msg) {\n", " for (var key in msg) {\n", " if (!(key in fig.buttons)) {\n", " continue;\n", " }\n", " fig.buttons[key].disabled = !msg[key];\n", " fig.buttons[key].setAttribute('aria-disabled', !msg[key]);\n", " }\n", "};\n", "\n", "mpl.figure.prototype.handle_navigate_mode = function (fig, msg) {\n", " if (msg['mode'] === 'PAN') {\n", " fig.buttons['Pan'].classList.add('active');\n", " fig.buttons['Zoom'].classList.remove('active');\n", " } else if (msg['mode'] === 'ZOOM') {\n", " fig.buttons['Pan'].classList.remove('active');\n", " fig.buttons['Zoom'].classList.add('active');\n", " } else {\n", " fig.buttons['Pan'].classList.remove('active');\n", " fig.buttons['Zoom'].classList.remove('active');\n", " }\n", "};\n", "\n", "mpl.figure.prototype.updated_canvas_event = function () {\n", " // Called whenever the canvas gets updated.\n", " this.send_message('ack', {});\n", "};\n", "\n", "// A function to construct a web socket function for onmessage handling.\n", "// Called in the figure constructor.\n", 
"mpl.figure.prototype._make_on_message_function = function (fig) {\n", " return function socket_on_message(evt) {\n", " if (evt.data instanceof Blob) {\n", " var img = evt.data;\n", " if (img.type !== 'image/png') {\n", " /* FIXME: We get \"Resource interpreted as Image but\n", " * transferred with MIME type text/plain:\" errors on\n", " * Chrome. But how to set the MIME type? It doesn't seem\n", " * to be part of the websocket stream */\n", " img.type = 'image/png';\n", " }\n", "\n", " /* Free the memory for the previous frames */\n", " if (fig.imageObj.src) {\n", " (window.URL || window.webkitURL).revokeObjectURL(\n", " fig.imageObj.src\n", " );\n", " }\n", "\n", " fig.imageObj.src = (window.URL || window.webkitURL).createObjectURL(\n", " img\n", " );\n", " fig.updated_canvas_event();\n", " fig.waiting = false;\n", " return;\n", " } else if (\n", " typeof evt.data === 'string' &&\n", " evt.data.slice(0, 21) === 'data:image/png;base64'\n", " ) {\n", " fig.imageObj.src = evt.data;\n", " fig.updated_canvas_event();\n", " fig.waiting = false;\n", " return;\n", " }\n", "\n", " var msg = JSON.parse(evt.data);\n", " var msg_type = msg['type'];\n", "\n", " // Call the \"handle_{type}\" callback, which takes\n", " // the figure and JSON message as its only arguments.\n", " try {\n", " var callback = fig['handle_' + msg_type];\n", " } catch (e) {\n", " console.log(\n", " \"No handler for the '\" + msg_type + \"' message type: \",\n", " msg\n", " );\n", " return;\n", " }\n", "\n", " if (callback) {\n", " try {\n", " // console.log(\"Handling '\" + msg_type + \"' message: \", msg);\n", " callback(fig, msg);\n", " } catch (e) {\n", " console.log(\n", " \"Exception inside the 'handler_\" + msg_type + \"' callback:\",\n", " e,\n", " e.stack,\n", " msg\n", " );\n", " }\n", " }\n", " };\n", "};\n", "\n", "// from http://stackoverflow.com/questions/1114465/getting-mouse-location-in-canvas\n", "mpl.findpos = function (e) {\n", " //this section is from 
http://www.quirksmode.org/js/events_properties.html\n", " var targ;\n", " if (!e) {\n", " e = window.event;\n", " }\n", " if (e.target) {\n", " targ = e.target;\n", " } else if (e.srcElement) {\n", " targ = e.srcElement;\n", " }\n", " if (targ.nodeType === 3) {\n", " // defeat Safari bug\n", " targ = targ.parentNode;\n", " }\n", "\n", " // pageX,Y are the mouse positions relative to the document\n", " var boundingRect = targ.getBoundingClientRect();\n", " var x = e.pageX - (boundingRect.left + document.body.scrollLeft);\n", " var y = e.pageY - (boundingRect.top + document.body.scrollTop);\n", "\n", " return { x: x, y: y };\n", "};\n", "\n", "/*\n", " * return a copy of an object with only non-object keys\n", " * we need this to avoid circular references\n", " * http://stackoverflow.com/a/24161582/3208463\n", " */\n", "function simpleKeys(original) {\n", " return Object.keys(original).reduce(function (obj, key) {\n", " if (typeof original[key] !== 'object') {\n", " obj[key] = original[key];\n", " }\n", " return obj;\n", " }, {});\n", "}\n", "\n", "mpl.figure.prototype.mouse_event = function (event, name) {\n", " var canvas_pos = mpl.findpos(event);\n", "\n", " if (name === 'button_press') {\n", " this.canvas.focus();\n", " this.canvas_div.focus();\n", " }\n", "\n", " var x = canvas_pos.x * this.ratio;\n", " var y = canvas_pos.y * this.ratio;\n", "\n", " this.send_message(name, {\n", " x: x,\n", " y: y,\n", " button: event.button,\n", " step: event.step,\n", " guiEvent: simpleKeys(event),\n", " });\n", "\n", " /* This prevents the web browser from automatically changing to\n", " * the text insertion cursor when the button is pressed. 
We want\n", " * to control all of the cursor setting manually through the\n", " * 'cursor' event from matplotlib */\n", " event.preventDefault();\n", " return false;\n", "};\n", "\n", "mpl.figure.prototype._key_event_extra = function (_event, _name) {\n", " // Handle any extra behaviour associated with a key event\n", "};\n", "\n", "mpl.figure.prototype.key_event = function (event, name) {\n", " // Prevent repeat events\n", " if (name === 'key_press') {\n", " if (event.key === this._key) {\n", " return;\n", " } else {\n", " this._key = event.key;\n", " }\n", " }\n", " if (name === 'key_release') {\n", " this._key = null;\n", " }\n", "\n", " var value = '';\n", " if (event.ctrlKey && event.key !== 'Control') {\n", " value += 'ctrl+';\n", " }\n", " else if (event.altKey && event.key !== 'Alt') {\n", " value += 'alt+';\n", " }\n", " else if (event.shiftKey && event.key !== 'Shift') {\n", " value += 'shift+';\n", " }\n", "\n", " value += 'k' + event.key;\n", "\n", " this._key_event_extra(event, name);\n", "\n", " this.send_message(name, { key: value, guiEvent: simpleKeys(event) });\n", " return false;\n", "};\n", "\n", "mpl.figure.prototype.toolbar_button_onclick = function (name) {\n", " if (name === 'download') {\n", " this.handle_save(this, null);\n", " } else {\n", " this.send_message('toolbar_button', { name: name });\n", " }\n", "};\n", "\n", "mpl.figure.prototype.toolbar_button_onmouseover = function (tooltip) {\n", " this.message.textContent = tooltip;\n", "};\n", "\n", "///////////////// REMAINING CONTENT GENERATED BY embed_js.py /////////////////\n", "// prettier-ignore\n", "var _JSXTOOLS_RESIZE_OBSERVER=function(A){var t,i=new WeakMap,n=new WeakMap,a=new WeakMap,r=new WeakMap,o=new Set;function s(e){if(!(this instanceof s))throw new TypeError(\"Constructor requires 'new' operator\");i.set(this,e)}function h(){throw new TypeError(\"Function is not a constructor\")}function c(e,t,i,n){e=0 in arguments?Number(arguments[0]):0,t=1 in 
arguments?Number(arguments[1]):0,i=2 in arguments?Number(arguments[2]):0,n=3 in arguments?Number(arguments[3]):0,this.right=(this.x=this.left=e)+(this.width=i),this.bottom=(this.y=this.top=t)+(this.height=n),Object.freeze(this)}function d(){t=requestAnimationFrame(d);var s=new WeakMap,p=new Set;o.forEach((function(t){r.get(t).forEach((function(i){var r=t instanceof window.SVGElement,o=a.get(t),d=r?0:parseFloat(o.paddingTop),f=r?0:parseFloat(o.paddingRight),l=r?0:parseFloat(o.paddingBottom),u=r?0:parseFloat(o.paddingLeft),g=r?0:parseFloat(o.borderTopWidth),m=r?0:parseFloat(o.borderRightWidth),w=r?0:parseFloat(o.borderBottomWidth),b=u+f,F=d+l,v=(r?0:parseFloat(o.borderLeftWidth))+m,W=g+w,y=r?0:t.offsetHeight-W-t.clientHeight,E=r?0:t.offsetWidth-v-t.clientWidth,R=b+v,z=F+W,M=r?t.width:parseFloat(o.width)-R-E,O=r?t.height:parseFloat(o.height)-z-y;if(n.has(t)){var k=n.get(t);if(k[0]===M&&k[1]===O)return}n.set(t,[M,O]);var S=Object.create(h.prototype);S.target=t,S.contentRect=new c(u,d,M,O),s.has(i)||(s.set(i,[]),p.add(i)),s.get(i).push(S)}))})),p.forEach((function(e){i.get(e).call(e,s.get(e),e)}))}return s.prototype.observe=function(i){if(i instanceof window.Element){r.has(i)||(r.set(i,new Set),o.add(i),a.set(i,window.getComputedStyle(i)));var n=r.get(i);n.has(this)||n.add(this),cancelAnimationFrame(t),t=requestAnimationFrame(d)}},s.prototype.unobserve=function(i){if(i instanceof window.Element&&r.has(i)){var n=r.get(i);n.has(this)&&(n.delete(this),n.size||(r.delete(i),o.delete(i))),n.size||r.delete(i),o.size||cancelAnimationFrame(t)}},A.DOMRectReadOnly=c,A.ResizeObserver=s,A.ResizeObserverEntry=h,A}; // eslint-disable-line\n", "mpl.toolbar_items = [[\"Home\", \"Reset original view\", \"fa fa-home icon-home\", \"home\"], [\"Back\", \"Back to previous view\", \"fa fa-arrow-left icon-arrow-left\", \"back\"], [\"Forward\", \"Forward to next view\", \"fa fa-arrow-right icon-arrow-right\", \"forward\"], [\"\", \"\", \"\", \"\"], [\"Pan\", \"Left button pans, Right button 
zooms\\nx/y fixes axis, CTRL fixes aspect\", \"fa fa-arrows icon-move\", \"pan\"], [\"Zoom\", \"Zoom to rectangle\\nx/y fixes axis, CTRL fixes aspect\", \"fa fa-square-o icon-check-empty\", \"zoom\"], [\"\", \"\", \"\", \"\"], [\"Download\", \"Download plot\", \"fa fa-floppy-o icon-save\", \"download\"]];\n", "\n", "mpl.extensions = [\"eps\", \"jpeg\", \"pgf\", \"pdf\", \"png\", \"ps\", \"raw\", \"svg\", \"tif\"];\n", "\n", "mpl.default_extension = \"png\";/* global mpl */\n", "\n", "var comm_websocket_adapter = function (comm) {\n", " // Create a \"websocket\"-like object which calls the given IPython comm\n", " // object with the appropriate methods. Currently this is a non binary\n", " // socket, so there is still some room for performance tuning.\n", " var ws = {};\n", "\n", " ws.binaryType = comm.kernel.ws.binaryType;\n", " ws.readyState = comm.kernel.ws.readyState;\n", " function updateReadyState(_event) {\n", " if (comm.kernel.ws) {\n", " ws.readyState = comm.kernel.ws.readyState;\n", " } else {\n", " ws.readyState = 3; // Closed state.\n", " }\n", " }\n", " comm.kernel.ws.addEventListener('open', updateReadyState);\n", " comm.kernel.ws.addEventListener('close', updateReadyState);\n", " comm.kernel.ws.addEventListener('error', updateReadyState);\n", "\n", " ws.close = function () {\n", " comm.close();\n", " };\n", " ws.send = function (m) {\n", " //console.log('sending', m);\n", " comm.send(m);\n", " };\n", " // Register the callback with on_msg.\n", " comm.on_msg(function (msg) {\n", " //console.log('receiving', msg['content']['data'], msg);\n", " var data = msg['content']['data'];\n", " if (data['blob'] !== undefined) {\n", " data = {\n", " data: new Blob(msg['buffers'], { type: data['blob'] }),\n", " };\n", " }\n", " // Pass the mpl event to the overridden (by mpl) onmessage function.\n", " ws.onmessage(data);\n", " });\n", " return ws;\n", "};\n", "\n", "mpl.mpl_figure_comm = function (comm, msg) {\n", " // This is the function which gets called when the 
mpl process\n", " // starts-up an IPython Comm through the \"matplotlib\" channel.\n", "\n", " var id = msg.content.data.id;\n", " // Get hold of the div created by the display call when the Comm\n", " // socket was opened in Python.\n", " var element = document.getElementById(id);\n", " var ws_proxy = comm_websocket_adapter(comm);\n", "\n", " function ondownload(figure, _format) {\n", " window.open(figure.canvas.toDataURL());\n", " }\n", "\n", " var fig = new mpl.figure(id, ws_proxy, ondownload, element);\n", "\n", " // Call onopen now - mpl needs it, as it is assuming we've passed it a real\n", " // web socket which is closed, not our websocket->open comm proxy.\n", " ws_proxy.onopen();\n", "\n", " fig.parent_element = element;\n", " fig.cell_info = mpl.find_output_cell(\"
\");\n", " if (!fig.cell_info) {\n", " console.error('Failed to find cell for figure', id, fig);\n", " return;\n", " }\n", " fig.cell_info[0].output_area.element.on(\n", " 'cleared',\n", " { fig: fig },\n", " fig._remove_fig_handler\n", " );\n", "};\n", "\n", "mpl.figure.prototype.handle_close = function (fig, msg) {\n", " var width = fig.canvas.width / fig.ratio;\n", " fig.cell_info[0].output_area.element.off(\n", " 'cleared',\n", " fig._remove_fig_handler\n", " );\n", " fig.resizeObserverInstance.unobserve(fig.canvas_div);\n", "\n", " // Update the output cell to use the data from the current canvas.\n", " fig.push_to_output();\n", " var dataURL = fig.canvas.toDataURL();\n", " // Re-enable the keyboard manager in IPython - without this line, in FF,\n", " // the notebook keyboard shortcuts fail.\n", " IPython.keyboard_manager.enable();\n", " fig.parent_element.innerHTML =\n", " '';\n", " fig.close_ws(fig, msg);\n", "};\n", "\n", "mpl.figure.prototype.close_ws = function (fig, msg) {\n", " fig.send_message('closing', msg);\n", " // fig.ws.close()\n", "};\n", "\n", "mpl.figure.prototype.push_to_output = function (_remove_interactive) {\n", " // Turn the data on the canvas into data in the output cell.\n", " var width = this.canvas.width / this.ratio;\n", " var dataURL = this.canvas.toDataURL();\n", " this.cell_info[1]['text/html'] =\n", " '';\n", "};\n", "\n", "mpl.figure.prototype.updated_canvas_event = function () {\n", " // Tell IPython that the notebook contents must change.\n", " IPython.notebook.set_dirty(true);\n", " this.send_message('ack', {});\n", " var fig = this;\n", " // Wait a second, then push the new image to the DOM so\n", " // that it is saved nicely (might be nice to debounce this).\n", " setTimeout(function () {\n", " fig.push_to_output();\n", " }, 1000);\n", "};\n", "\n", "mpl.figure.prototype._init_toolbar = function () {\n", " var fig = this;\n", "\n", " var toolbar = document.createElement('div');\n", " toolbar.classList = 'btn-toolbar';\n", 
" this.root.appendChild(toolbar);\n", "\n", " function on_click_closure(name) {\n", " return function (_event) {\n", " return fig.toolbar_button_onclick(name);\n", " };\n", " }\n", "\n", " function on_mouseover_closure(tooltip) {\n", " return function (event) {\n", " if (!event.currentTarget.disabled) {\n", " return fig.toolbar_button_onmouseover(tooltip);\n", " }\n", " };\n", " }\n", "\n", " fig.buttons = {};\n", " var buttonGroup = document.createElement('div');\n", " buttonGroup.classList = 'btn-group';\n", " var button;\n", " for (var toolbar_ind in mpl.toolbar_items) {\n", " var name = mpl.toolbar_items[toolbar_ind][0];\n", " var tooltip = mpl.toolbar_items[toolbar_ind][1];\n", " var image = mpl.toolbar_items[toolbar_ind][2];\n", " var method_name = mpl.toolbar_items[toolbar_ind][3];\n", "\n", " if (!name) {\n", " /* Instead of a spacer, we start a new button group. */\n", " if (buttonGroup.hasChildNodes()) {\n", " toolbar.appendChild(buttonGroup);\n", " }\n", " buttonGroup = document.createElement('div');\n", " buttonGroup.classList = 'btn-group';\n", " continue;\n", " }\n", "\n", " button = fig.buttons[name] = document.createElement('button');\n", " button.classList = 'btn btn-default';\n", " button.href = '#';\n", " button.title = name;\n", " button.innerHTML = '';\n", " button.addEventListener('click', on_click_closure(method_name));\n", " button.addEventListener('mouseover', on_mouseover_closure(tooltip));\n", " buttonGroup.appendChild(button);\n", " }\n", "\n", " if (buttonGroup.hasChildNodes()) {\n", " toolbar.appendChild(buttonGroup);\n", " }\n", "\n", " // Add the status bar.\n", " var status_bar = document.createElement('span');\n", " status_bar.classList = 'mpl-message pull-right';\n", " toolbar.appendChild(status_bar);\n", " this.message = status_bar;\n", "\n", " // Add the close button to the window.\n", " var buttongrp = document.createElement('div');\n", " buttongrp.classList = 'btn-group inline pull-right';\n", " button = 
document.createElement('button');\n", " button.classList = 'btn btn-mini btn-primary';\n", " button.href = '#';\n", " button.title = 'Stop Interaction';\n", " button.innerHTML = '';\n", " button.addEventListener('click', function (_evt) {\n", " fig.handle_close(fig, {});\n", " });\n", " button.addEventListener(\n", " 'mouseover',\n", " on_mouseover_closure('Stop Interaction')\n", " );\n", " buttongrp.appendChild(button);\n", " var titlebar = this.root.querySelector('.ui-dialog-titlebar');\n", " titlebar.insertBefore(buttongrp, titlebar.firstChild);\n", "};\n", "\n", "mpl.figure.prototype._remove_fig_handler = function (event) {\n", " var fig = event.data.fig;\n", " if (event.target !== this) {\n", " // Ignore bubbled events from children.\n", " return;\n", " }\n", " fig.close_ws(fig, {});\n", "};\n", "\n", "mpl.figure.prototype._root_extra_style = function (el) {\n", " el.style.boxSizing = 'content-box'; // override notebook setting of border-box.\n", "};\n", "\n", "mpl.figure.prototype._canvas_extra_style = function (el) {\n", " // this is important to make the div 'focusable\n", " el.setAttribute('tabindex', 0);\n", " // reach out to IPython and tell the keyboard manager to turn it's self\n", " // off when our div gets focus\n", "\n", " // location in version 3\n", " if (IPython.notebook.keyboard_manager) {\n", " IPython.notebook.keyboard_manager.register_events(el);\n", " } else {\n", " // location in version 2\n", " IPython.keyboard_manager.register_events(el);\n", " }\n", "};\n", "\n", "mpl.figure.prototype._key_event_extra = function (event, _name) {\n", " var manager = IPython.notebook.keyboard_manager;\n", " if (!manager) {\n", " manager = IPython.keyboard_manager;\n", " }\n", "\n", " // Check for shift+enter\n", " if (event.shiftKey && event.which === 13) {\n", " this.canvas_div.blur();\n", " // select the cell after this one\n", " var index = IPython.notebook.find_cell_index(this.cell_info[0]);\n", " IPython.notebook.select(index + 1);\n", " }\n", "};\n", 
"\n", "mpl.figure.prototype.handle_save = function (fig, _msg) {\n", " fig.ondownload(fig, null);\n", "};\n", "\n", "mpl.find_output_cell = function (html_output) {\n", " // Return the cell and output element which can be found *uniquely* in the notebook.\n", " // Note - this is a bit hacky, but it is done because the \"notebook_saving.Notebook\"\n", " // IPython event is triggered only after the cells have been serialised, which for\n", " // our purposes (turning an active figure into a static one), is too late.\n", " var cells = IPython.notebook.get_cells();\n", " var ncells = cells.length;\n", " for (var i = 0; i < ncells; i++) {\n", " var cell = cells[i];\n", " if (cell.cell_type === 'code') {\n", " for (var j = 0; j < cell.output_area.outputs.length; j++) {\n", " var data = cell.output_area.outputs[j];\n", " if (data.data) {\n", " // IPython >= 3 moved mimebundle to data attribute of output\n", " data = data.data;\n", " }\n", " if (data['text/html'] === html_output) {\n", " return [cell, data, j];\n", " }\n", " }\n", " }\n", " }\n", "};\n", "\n", "// Register the function which deals with the matplotlib target/channel.\n", "// The kernel may be null if the page has been refreshed.\n", "if (IPython.notebook.kernel !== null) {\n", " IPython.notebook.kernel.comm_manager.register_target(\n", " 'matplotlib',\n", " mpl.mpl_figure_comm\n", " );\n", "}\n" ], "text/plain": [ "" ] }, "metadata": {}, "output_type": "display_data" }, { "data": { "text/html": [ "" ], "text/plain": [ "" ] }, "metadata": {}, "output_type": "display_data" } ], "source": [ "m_yr.plot(x='year',y='count',kind='line', title='Movies per year');" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "#### 2) Top movies by number of ratings" ] }, { "cell_type": "code", "execution_count": 22, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
titlecount(1)
0Forrest Gump (1994)45782
1Shawshank Redemption, The (1994)45546
2Pulp Fiction (1994)43755
3Silence of the Lambs, The (1991)41807
4Matrix, The (1999)38860
\n", "
" ], "text/plain": [ " title count(1)\n", "0 Forrest Gump (1994) 45782\n", "1 Shawshank Redemption, The (1994) 45546\n", "2 Pulp Fiction (1994) 43755\n", "3 Silence of the Lambs, The (1991) 41807\n", "4 Matrix, The (1999) 38860" ] }, "execution_count": 22, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# A query to perform a join operation between movies and ratings\n", "# Find the highest rated movies\n", "\n", "spark.sql(\"\"\"\n", "select title, count(*) \n", "from movies m, ratings r \n", "where m.movieId = r.movieId\n", "group by title \n", "order by 2 desc\"\"\").limit(5).toPandas()" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "#### What happened in the background? How did the join happen?\n", "#### How is the query executed? Can we get more information?" ] }, { "cell_type": "code", "execution_count": 23, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "== Parsed Logical Plan ==\n", "'Sort [2 DESC NULLS LAST], true\n", "+- 'Aggregate ['title], ['title, unresolvedalias('count(1), None)]\n", " +- 'Filter ('m.movieId = 'r.movieId)\n", " +- 'Join Inner\n", " :- 'SubqueryAlias m\n", " : +- 'UnresolvedRelation [movies], [], false\n", " +- 'SubqueryAlias r\n", " +- 'UnresolvedRelation [ratings], [], false\n", "\n", "== Analyzed Logical Plan ==\n", "title: string, count(1): bigint\n", "Sort [count(1)#866L DESC NULLS LAST], true\n", "+- Aggregate [title#86], [title#86, count(1) AS count(1)#866L]\n", " +- Filter (movieId#85 = movieId#62)\n", " +- Join Inner\n", " :- SubqueryAlias m\n", " : +- SubqueryAlias movies\n", " : +- View (`movies`, [movieId#85,title#86,genres#87])\n", " : +- Relation [movieId#85,title#86,genres#87] csv\n", " +- SubqueryAlias r\n", " +- SubqueryAlias ratings\n", " +- View (`ratings`, [userId#61,movieId#62,rating#63,timestamp#64])\n", " +- Relation [userId#61,movieId#62,rating#63,timestamp#64] csv\n", "\n", "== Optimized Logical Plan ==\n", "Sort [count(1)#866L DESC NULLS 
LAST], true\n", "+- Aggregate [title#86], [title#86, count(1) AS count(1)#866L]\n", " +- Project [title#86]\n", " +- Join Inner, (movieId#85 = movieId#62)\n", " :- Project [movieId#85, title#86]\n", " : +- Filter isnotnull(movieId#85)\n", " : +- InMemoryRelation [movieId#85, title#86, genres#87], StorageLevel(disk, memory, deserialized, 1 replicas)\n", " : +- FileScan csv [movieId#85,title#86,genres#87] Batched: false, DataFilters: [], Format: CSV, Location: InMemoryFileIndex(1 paths)[file:/eos/home-c/canali/SWAN_projects/HadoopSparkTraining/Spark/data/m..., PartitionFilters: [], PushedFilters: [], ReadSchema: struct\n", " +- Project [movieId#62]\n", " +- Filter isnotnull(movieId#62)\n", " +- InMemoryRelation [userId#61, movieId#62, rating#63, timestamp#64], StorageLevel(disk, memory, deserialized, 1 replicas)\n", " +- FileScan csv [userId#61,movieId#62,rating#63,timestamp#64] Batched: false, DataFilters: [], Format: CSV, Location: InMemoryFileIndex(1 paths)[file:/eos/home-c/canali/SWAN_projects/HadoopSparkTraining/Spark/data/r..., PartitionFilters: [], PushedFilters: [], ReadSchema: struct\n", "\n", "== Physical Plan ==\n", "AdaptiveSparkPlan isFinalPlan=false\n", "+- Sort [count(1)#866L DESC NULLS LAST], true, 0\n", " +- Exchange rangepartitioning(count(1)#866L DESC NULLS LAST, 200), ENSURE_REQUIREMENTS, [id=#514]\n", " +- HashAggregate(keys=[title#86], functions=[count(1)], output=[title#86, count(1)#866L])\n", " +- Exchange hashpartitioning(title#86, 200), ENSURE_REQUIREMENTS, [id=#511]\n", " +- HashAggregate(keys=[title#86], functions=[partial_count(1)], output=[title#86, count#975L])\n", " +- Project [title#86]\n", " +- BroadcastHashJoin [movieId#85], [movieId#62], Inner, BuildLeft, false\n", " :- BroadcastExchange HashedRelationBroadcastMode(List(cast(input[0, int, false] as bigint)),false), [id=#506]\n", " : +- Filter isnotnull(movieId#85)\n", " : +- InMemoryTableScan [movieId#85, title#86], [isnotnull(movieId#85)]\n", " : +- InMemoryRelation [movieId#85, 
title#86, genres#87], StorageLevel(disk, memory, deserialized, 1 replicas)\n", " : +- FileScan csv [movieId#85,title#86,genres#87] Batched: false, DataFilters: [], Format: CSV, Location: InMemoryFileIndex(1 paths)[file:/eos/home-c/canali/SWAN_projects/HadoopSparkTraining/Spark/data/m..., PartitionFilters: [], PushedFilters: [], ReadSchema: struct\n", " +- Filter isnotnull(movieId#62)\n", " +- InMemoryTableScan [movieId#62], [isnotnull(movieId#62)]\n", " +- InMemoryRelation [userId#61, movieId#62, rating#63, timestamp#64], StorageLevel(disk, memory, deserialized, 1 replicas)\n", " +- FileScan csv [userId#61,movieId#62,rating#63,timestamp#64] Batched: false, DataFilters: [], Format: CSV, Location: InMemoryFileIndex(1 paths)[file:/eos/home-c/canali/SWAN_projects/HadoopSparkTraining/Spark/data/r..., PartitionFilters: [], PushedFilters: [], ReadSchema: struct\n", "\n" ] } ], "source": [ "spark.sql(\"\"\"\n", "select title, count(*) \n", "from movies m, ratings r \n", "where m.movieId = r.movieId\n", "group by title \n", "order by 2 desc\"\"\").explain(True)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "#### 3) Highly rated movies\n", "Find the top 5 highly rated movies" ] }, { "cell_type": "code", "execution_count": 24, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "+-----------------------------------+----------+\n", "|title |avg_rating|\n", "+-----------------------------------+----------+\n", "|The Hardy Bucks Movie (2013) |5.0 |\n", "|Naked Souls (1996) |5.0 |\n", "|Paul Goodman Changed My Life (2011)|5.0 |\n", "|Jimi Plays Berkeley (2012) |5.0 |\n", "|Punk in London (1977) |5.0 |\n", "+-----------------------------------+----------+\n", "only showing top 5 rows\n", "\n" ] } ], "source": [ "spark.sql(\"\"\"select title, avg(rating) as avg_rating from movies m, ratings r\n", " where m.movieId = r.movieId\n", " group by title\n", " order by 2 desc\"\"\").show(5, False)" ] }, { "cell_type": "markdown", "metadata": 
{}, "source": [ "#### Drill down on the top entries:\n", " - How many reviews contributed to this rating?" ] }, { "cell_type": "code", "execution_count": 25, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "+-----------------------------------+----------+-----+\n", "|title |avg_rating|count|\n", "+-----------------------------------+----------+-----+\n", "|The Hardy Bucks Movie (2013) |5.0 |1 |\n", "|Naked Souls (1996) |5.0 |1 |\n", "|Paul Goodman Changed My Life (2011)|5.0 |1 |\n", "|Jimi Plays Berkeley (2012) |5.0 |1 |\n", "|Punk in London (1977) |5.0 |1 |\n", "+-----------------------------------+----------+-----+\n", "only showing top 5 rows\n", "\n" ] } ], "source": [ "spark.sql(\"\"\"select title, avg(rating) as avg_rating, count(*) as count from movies m, ratings r\n", " where m.movieId = r.movieId\n", " group by title\n", " order by 2 desc\"\"\").show(5, False)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "Let's only take into account movies that have more than 100 reviews" ] }, { "cell_type": "code", "execution_count": 26, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
titleavg_ratingcount
0Planet Earth (2006)4.467391368
1Band of Brothers (2001)4.431655139
2Shawshank Redemption, The (1994)4.42633845546
3Godfather, The (1972)4.33564828582
4Usual Suspects, The (1995)4.29949429635
5Godfather: Part II, The (1974)4.26671818319
6Seven Samurai (Shichinin no samurai) (1954)4.2655076900
7Schindler's List (1993)4.26194533780
8The Blue Planet (2001)4.234615130
9Fight Club (1999)4.23203429931
10One Flew Over the Cuckoo's Nest (1975)4.23085219937
1112 Angry Men (1957)4.2295208374
12Rear Window (1954)4.22951110542
13Paths of Glory (1957)4.2181402150
14Casablanca (1942)4.21529214903
15Sunset Blvd. (a.k.a. Sunset Boulevard) (1950)4.2142143975
16North by Northwest (1959)4.2114889445
17Third Man, The (1949)4.2101963825
18Spirited Away (Sen to Chihiro no kamikakushi) ...4.20965610398
19Dr. Strangelove or: How I Learned to Stop Worr...4.20944113992
\n", "
" ], "text/plain": [ " title avg_rating count\n", "0 Planet Earth (2006) 4.467391 368\n", "1 Band of Brothers (2001) 4.431655 139\n", "2 Shawshank Redemption, The (1994) 4.426338 45546\n", "3 Godfather, The (1972) 4.335648 28582\n", "4 Usual Suspects, The (1995) 4.299494 29635\n", "5 Godfather: Part II, The (1974) 4.266718 18319\n", "6 Seven Samurai (Shichinin no samurai) (1954) 4.265507 6900\n", "7 Schindler's List (1993) 4.261945 33780\n", "8 The Blue Planet (2001) 4.234615 130\n", "9 Fight Club (1999) 4.232034 29931\n", "10 One Flew Over the Cuckoo's Nest (1975) 4.230852 19937\n", "11 12 Angry Men (1957) 4.229520 8374\n", "12 Rear Window (1954) 4.229511 10542\n", "13 Paths of Glory (1957) 4.218140 2150\n", "14 Casablanca (1942) 4.215292 14903\n", "15 Sunset Blvd. (a.k.a. Sunset Boulevard) (1950) 4.214214 3975\n", "16 North by Northwest (1959) 4.211488 9445\n", "17 Third Man, The (1949) 4.210196 3825\n", "18 Spirited Away (Sen to Chihiro no kamikakushi) ... 4.209656 10398\n", "19 Dr. Strangelove or: How I Learned to Stop Worr... 
4.209441 13992" ] }, "execution_count": 26, "metadata": {}, "output_type": "execute_result" } ], "source": [ "spark.sql(\"\"\"select title, avg(rating) as avg_rating, count(*) as count from movies m, ratings r\n", " where m.movieId = r.movieId\n", " group by title\n", " having count(*) > 100\n", " order by 2 desc\"\"\").limit(20).toPandas()" ] }, { "cell_type": "markdown", "metadata": { "collapsed": true }, "source": [ "#### 4) Find the top-rated movie of every year after 2000" ] }, { "cell_type": "code", "execution_count": 27, "metadata": {}, "outputs": [], "source": [ "avg_ratings = spark.sql(\"\"\"select year, title, round(avg(rating),2) as avg_rating, count(*) as count\n", " from movies_year m, ratings r where m.movieId = r.movieId \n", " group by year, title\n", " having count(*) > 100\"\"\")\n", "\n", "avg_ratings.createOrReplaceTempView(\"avg_ratings\")\n", "\n", "# note this is just the definition of a helper view\n", "# because of lazy execution no query is run at this step" ] }, { "cell_type": "code", "execution_count": 28, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "+----+--------------------------------------------------------------------------+----------+\n", "|year|title |avg_rating|\n", "+----+--------------------------------------------------------------------------+----------+\n", "|2001|Band of Brothers (2001) |4.43 |\n", "|2002|City of God (Cidade de Deus) (2002) |4.19 |\n", "|2003|Fog of War: Eleven Lessons from the Life of Robert S.
McNamara, The (2003)|4.11 |\n", "|2003|Lord of the Rings: The Return of the King, The (2003) |4.11 |\n", "|2004|Voices from the List (2004) |4.15 |\n", "|2005|49 Up (2005) |4.06 |\n", "|2006|Planet Earth (2006) |4.47 |\n", "|2007|Like Stars on Earth (Taare Zameen Par) (2007) |4.04 |\n", "|2008|Dark Knight, The (2008) |4.18 |\n", "|2009|Secret in Their Eyes, The (El secreto de sus ojos) (2009) |4.03 |\n", "|2010|Inception (2010) |4.16 |\n", "|2011|Intouchables (2011) |4.12 |\n", "|2012|Hunt, The (Jagten) (2012) |4.07 |\n", "|2013|Doctor Who: The Time of the Doctor (2013) |4.07 |\n", "|2014|Wild Tales (2014) |4.15 |\n", "|2015|The Jinx: The Life and Deaths of Robert Durst (2015) |4.09 |\n", "|2016|Piper (2016) |4.1 |\n", "|2017|Dunkirk (2017) |4.11 |\n", "+----+--------------------------------------------------------------------------+----------+\n", "\n" ] } ], "source": [ "# the query for top-rated movies is run here, triggered by the action to show the first 20 rows\n", "\n", "spark.sql(\"\"\"select a.year, a.title, avg_rating from avg_ratings a,\n", " (select year, max(avg_rating) as max_rating from avg_ratings group by year) m\n", " where a.year = m.year\n", " and a.avg_rating = m.max_rating\n", " and a.year > 2000\n", " order by year\"\"\").show(20, False)" ] }, { "cell_type": "code", "execution_count": 29, "metadata": {}, "outputs": [], "source": [ "# End the Spark application\n", "spark.stop()" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [] } ], "metadata": { "kernelspec": { "display_name": "Python 3", "language": "python", "name": "python3" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.9.12" } }, "nbformat": 4, "nbformat_minor": 1 }