From 92da2a7ad6faf961d45495ec95ed21c9122140ed Mon Sep 17 00:00:00 2001 From: Manfred Cheung Date: Fri, 12 Dec 2025 03:40:16 -0500 Subject: [PATCH 1/7] add brightkite checkin demo and uncomment link to icij fincen demo in gfql notebook list --- .../social/brightkite_checkin.ipynb | 687 ++++++++++++++++++ docs/source/notebooks/gfql.rst | 2 +- 2 files changed, 688 insertions(+), 1 deletion(-) create mode 100644 demos/demos_by_use_case/social/brightkite_checkin.ipynb diff --git a/demos/demos_by_use_case/social/brightkite_checkin.ipynb b/demos/demos_by_use_case/social/brightkite_checkin.ipynb new file mode 100644 index 0000000000..8470f97746 --- /dev/null +++ b/demos/demos_by_use_case/social/brightkite_checkin.ipynb @@ -0,0 +1,687 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Brightkite Location-Based Social Network Dataset\n", + "\n", + "This notebook analyzes the Brightkite dataset from SNAP:\n", + "- **Network**: 58,228 users with 214,078 friendships\n", + "- **Check-ins**: 4.4M location check-ins from April 2008 - October 2010\n", + "\n", + "Source: https://snap.stanford.edu/data/loc-brightkite.html" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "import pandas as pd\n", + "import requests\n", + "import gzip\n", + "from io import BytesIO, StringIO\n", + "import graphistry\n", + "\n", + "# To specify Graphistry account & server, use:\n", + "# graphistry.register(api=3, protocol=\"https\", server=\"hub.graphistry.com\",\n", + "# username=\"...\", password=\"...\")\n" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Download and Parse Friendship Network" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Downloading friendship network...\n", + "Loaded 428,156 edges\n" + ] + }, + { + "data": { + "application/vnd.microsoft.datawrangler.viewer.v0+json": { + "columns": [ + { + "name": "index", + "rawType": "int64", + "type": "integer" + }, + { + "name": "user1", + "rawType": "int64", + "type": "integer" + }, + { + "name": "user2", + "rawType": "int64", + "type": "integer" + } + ], + "ref": "0004d8dd-b4b5-4d4f-894b-122d068d5d3b", + "rows": [ + [ + "0", + "0", + "1" + ], + [ + "1", + "0", + "2" + ], + [ + "2", + "0", + "3" + ], + [ + "3", + "0", + "4" + ], + [ + "4", + "0", + "5" + ] + ], + "shape": { + "columns": 2, + "rows": 5 + } + }, + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
user1user2
001
102
203
304
405
\n", + "
" + ], + "text/plain": [ + " user1 user2\n", + "0 0 1\n", + "1 0 2\n", + "2 0 3\n", + "3 0 4\n", + "4 0 5" + ] + }, + "execution_count": 2, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# Download friendship network\n", + "edges_url = 'https://snap.stanford.edu/data/loc-brightkite_edges.txt.gz'\n", + "print('Downloading friendship network...')\n", + "edges_response = requests.get(edges_url)\n", + "\n", + "# Decompress and parse\n", + "with gzip.GzipFile(fileobj=BytesIO(edges_response.content)) as f:\n", + " edges_content = f.read().decode('utf-8')\n", + "\n", + "# Parse into DataFrame\n", + "edges_df = pd.read_csv(\n", + " StringIO(edges_content),\n", + " sep='\\t',\n", + " comment='#',\n", + " names=['user1', 'user2'],\n", + " dtype={'user1': int, 'user2': int}\n", + ")\n", + "\n", + "print(f'Loaded {len(edges_df):,} edges')\n", + "edges_df.head()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Download and Parse Check-in Data" + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Downloading check-in data...\n", + "Loaded 4,491,144 check-ins\n" + ] + }, + { + "data": { + "application/vnd.microsoft.datawrangler.viewer.v0+json": { + "columns": [ + { + "name": "index", + "rawType": "int64", + "type": "integer" + }, + { + "name": "user", + "rawType": "int64", + "type": "integer" + }, + { + "name": "check_in_time", + "rawType": "datetime64[ns, UTC]", + "type": "unknown" + }, + { + "name": "latitude", + "rawType": "float64", + "type": "float" + }, + { + "name": "longitude", + "rawType": "float64", + "type": "float" + }, + { + "name": "location_id", + "rawType": "object", + "type": "string" + } + ], + "ref": "417ece4f-1c91-455c-b573-0114cd56f965", + "rows": [ + [ + "0", + "0", + "2010-10-17 01:48:53+00:00", + "39.747652", + "-104.99251", + "88c46bf20db295831bd2d1718ad7e6f5" + ], + [ + "1", + "0", + "2010-10-16 06:02:04+00:00", + "39.891383", + "-105.070814", + "7a0f88982aa015062b95e3b4843f9ca2" + ], + [ + "2", + "0", + "2010-10-16 03:48:54+00:00", + "39.891077", + "-105.068532", + "dd7cd3d264c2d063832db506fba8bf79" + ], + [ + "3", + "0", + "2010-10-14 18:25:51+00:00", + "39.750469", + "-104.999073", + "9848afcc62e500a01cf6fbf24b797732f8963683" + ], + [ + "4", + "0", + "2010-10-14 00:21:47+00:00", + "39.752713", + "-104.996337", + "2ef143e12038c870038df53e0478cefc" + ] + ], + "shape": { + "columns": 5, + "rows": 5 + } + }, + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
usercheck_in_timelatitudelongitudelocation_id
002010-10-17 01:48:53+00:0039.747652-104.99251088c46bf20db295831bd2d1718ad7e6f5
102010-10-16 06:02:04+00:0039.891383-105.0708147a0f88982aa015062b95e3b4843f9ca2
202010-10-16 03:48:54+00:0039.891077-105.068532dd7cd3d264c2d063832db506fba8bf79
302010-10-14 18:25:51+00:0039.750469-104.9990739848afcc62e500a01cf6fbf24b797732f8963683
402010-10-14 00:21:47+00:0039.752713-104.9963372ef143e12038c870038df53e0478cefc
\n", + "
" + ], + "text/plain": [ + " user check_in_time latitude longitude \\\n", + "0 0 2010-10-17 01:48:53+00:00 39.747652 -104.992510 \n", + "1 0 2010-10-16 06:02:04+00:00 39.891383 -105.070814 \n", + "2 0 2010-10-16 03:48:54+00:00 39.891077 -105.068532 \n", + "3 0 2010-10-14 18:25:51+00:00 39.750469 -104.999073 \n", + "4 0 2010-10-14 00:21:47+00:00 39.752713 -104.996337 \n", + "\n", + " location_id \n", + "0 88c46bf20db295831bd2d1718ad7e6f5 \n", + "1 7a0f88982aa015062b95e3b4843f9ca2 \n", + "2 dd7cd3d264c2d063832db506fba8bf79 \n", + "3 9848afcc62e500a01cf6fbf24b797732f8963683 \n", + "4 2ef143e12038c870038df53e0478cefc " + ] + }, + "execution_count": 3, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# Download check-in data\n", + "checkins_url = 'https://snap.stanford.edu/data/loc-brightkite_totalCheckins.txt.gz'\n", + "print('Downloading check-in data...')\n", + "checkins_response = requests.get(checkins_url)\n", + "\n", + "# Decompress and parse\n", + "with gzip.GzipFile(fileobj=BytesIO(checkins_response.content)) as f:\n", + " checkins_content = f.read().decode('utf-8')\n", + "\n", + "# Parse into DataFrame\n", + "checkins_df = pd.read_csv(\n", + " StringIO(checkins_content),\n", + " sep='\\t',\n", + " comment='#',\n", + " names=['user', 'check_in_time', 'latitude', 'longitude', 'location_id'],\n", + " dtype={'user': int},\n", + " parse_dates=['check_in_time']\n", + ")\n", + "\n", + "# Filter out likely invalid coordinates: (0, 0) or missing values\n", + "checkins_df = checkins_df[\n", + " checkins_df['latitude'].notna() & \n", + " checkins_df['longitude'].notna() & \n", + " ((checkins_df['latitude'] != 0) | (checkins_df['longitude'] != 0))\n", + "]\n", + "\n", + "print(f'Loaded {len(checkins_df):,} check-ins')\n", + "checkins_df.head()" + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Filtered edges: 428,156 -> 388,180\n", + "Users in network: 58,228\n", + "Users with valid check-ins: 50,686\n", + "Users in filtered network: 50,111\n" + ] + } + ], + "source": [ + "# Filter edges to only include users with valid check-ins\n", + "valid_users = set(checkins_df['user'].unique())\n", + "edges_df_filtered = edges_df[\n", + " edges_df['user1'].isin(valid_users) & \n", + " edges_df['user2'].isin(valid_users)\n", + "]\n", + "\n", + "print(f'Filtered edges: {len(edges_df):,} -> {len(edges_df_filtered):,}')\n", + "print(f'Users in network: {pd.concat([edges_df[\"user1\"], edges_df[\"user2\"]]).nunique():,}')\n", + "print(f'Users with valid check-ins: {len(valid_users):,}')\n", + "print(f'Users in filtered network: {pd.concat([edges_df_filtered[\"user1\"], edges_df_filtered[\"user2\"]]).nunique():,}')" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Visualize Friendship Network with Graphistry\n", + "\n", + "This visualization shows the social network of Brightkite users. Each node represents a user, positioned at their first check-in location. Edges represent friendships between users.\n", + "\n", + "**What to explore:**\n", + "- Community clusters: Groups of highly connected friends\n", + "- Geographic patterns: Whether friend groups cluster geographically\n", + "- Network hubs: Users with many connections (high degree)\n", + "- Network structure: Identify isolated groups vs. the main component" + ] + }, + { + "cell_type": "code", + "execution_count": 9, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "\n", + " \n", + " \n", + " \n", + " " + ], + "text/plain": [ + "" + ] + }, + "execution_count": 9, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# Visualize friendship network (filtered to users with valid check-ins)\n", + "# Use only first check-in per user for node positioning\n", + "\n", + "g = graphistry.edges(edges_df_filtered, 'user1', 'user2').nodes(checkins_df.groupby('user').first().reset_index(), 'user') \\\n", + " .layout_settings(play=0) \\\n", + " .settings(height=800, url_params={\"pointOpacity\": 0.6, \"edgeOpacity\": 0.01})\n", + "g.plot()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Create Hypergraph: Users + Check-ins\n", + "\n", + "This hypergraph combines two types of nodes: **user nodes** (blue, at average location) and **check-in nodes** (red, at actual check-in locations). Two types of edges connect them: **friendships** (blue) between users, and **user-to-check-in** edges (red) linking users to their check-ins.\n", + "\n", + "**What to explore:**\n", + "- Mobility patterns: Check-in scatter around user's average location reveals travel behavior\n", + "- Social-spatial correlation: Do friends visit similar locations?\n", + "- Activity levels: Number of red edges from a user shows check-in frequency\n", + "- Geographic hotspots: Dense red node clusters indicate popular locations\n", + "- User movement range: Distance between user node and their check-ins shows mobility" + ] + }, + { + "cell_type": "code", + "execution_count": 7, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Target check-ins: 500,000\n", + "Original check-ins: 4,491,144\n", + "Sampled check-ins: 503,445\n", + "Sample fraction: 11.13%\n", + "Users with check-ins: 50,686\n", + "User nodes: 50,686\n", + "Check-in nodes: 503,445\n", + "Friendship edges: 388,180\n", + "User->check-in edges: 503,445\n", + "Total nodes: 554,131\n", + "Total edges: 891,625\n" + ] + } + ], + "source": [ + "# Sample check-ins: keep at least 1 per user, then randomly sample the rest\n", + "# This ensures every user has representation while reducing total nodes\n", + "\n", + "# Target number of check-ins to approximate\n", + "target_checkins = 500_000\n", + "min_per_user = 1 # At least 1 check-in per user\n", + "\n", + "# Calculate sample fraction to approximate target\n", + "sample_fraction = target_checkins / len(checkins_df)\n", + "\n", + "sampled_checkins = []\n", + "for user_id, user_data in checkins_df.groupby('user'):\n", + " n_checkins = len(user_data)\n", + " n_sample = max(min_per_user, int(n_checkins * sample_fraction))\n", + " sampled_checkins.append(user_data.sample(n=n_sample, random_state=42))\n", + "\n", + "checkins_sampled = pd.concat(sampled_checkins, ignore_index=True)\n", + "\n", + "print(f'Target check-ins: {target_checkins:,}')\n", + "print(f'Original check-ins: {len(checkins_df):,}')\n", + "print(f'Sampled check-ins: {len(checkins_sampled):,}')\n", + "print(f'Sample fraction: {sample_fraction:.2%}')\n", + "print(f'Users with check-ins: {checkins_sampled[\"user\"].nunique():,}')\n", + "\n", + "# Create aggregated user nodes with average coordinates (using ALL check-ins for accuracy)\n", + "user_nodes = checkins_df.groupby('user').agg({\n", + " 'latitude': 'mean',\n", + " 'longitude': 'mean',\n", + " 'check_in_time': 'count'\n", + "}).reset_index()\n", + "user_nodes.columns = ['user', 'avg_latitude', 'avg_longitude', 'checkin_count']\n", + "user_nodes['type'] = 'user'\n", + "user_nodes['node_id'] = 'user_' + user_nodes['user'].astype(str)\n", + "\n", + "# Create check-in nodes from SAMPLED data\n", + "checkin_nodes = checkins_sampled.copy()\n", + "checkin_nodes['type'] = 'checkin'\n", + "checkin_nodes['node_id'] = 'checkin_' + checkin_nodes.index.astype(str)\n", + "\n", + "# Create user->check-in edges\n", + "user_checkin_edges = pd.DataFrame({\n", + " 'source': 'user_' + checkin_nodes['user'].astype(str),\n", + " 'destination': checkin_nodes['node_id'],\n", + " 'type': 'user_to_checkin'\n", + "})\n", + "\n", + "# Create friendship edges between user nodes\n", + "friendship_edges = pd.DataFrame({\n", + " 'source': 'user_' + edges_df_filtered['user1'].astype(str),\n", + " 'destination': 'user_' + edges_df_filtered['user2'].astype(str),\n", + " 'type': 'friendship'\n", + "})\n", + "\n", + "# Combine all edges\n", + "all_edges = pd.concat([friendship_edges, user_checkin_edges], ignore_index=True)\n", + "\n", + "# Combine all nodes\n", + "all_nodes = pd.concat([\n", + " user_nodes[['node_id', 'avg_latitude', 'avg_longitude', 'type', 'checkin_count']].rename(\n", + " columns={'avg_latitude': 'latitude', 'avg_longitude': 'longitude'}\n", + " ),\n", + " checkin_nodes[['node_id', 'latitude', 'longitude', 'type', 'check_in_time', 'location_id']]\n", + "], ignore_index=True)\n", + "\n", + "print(f'User nodes: {len(user_nodes):,}')\n", + "print(f'Check-in nodes: {len(checkin_nodes):,}')\n", + "print(f'Friendship edges: {len(friendship_edges):,}')\n", + "print(f'User->check-in edges: {len(user_checkin_edges):,}')\n", + "print(f'Total nodes: {len(all_nodes):,}')\n", + "print(f'Total edges: {len(all_edges):,}')" + ] + }, + { + "cell_type": "code", + "execution_count": 15, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "\n", + " \n", + " \n", + " \n", + " " + ], + "text/plain": [ + "" + ] + }, + "execution_count": 15, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# Create hypergraph visualization\n", + "g_hyper = graphistry.edges(all_edges, 'source', 'destination').nodes(all_nodes, 'node_id') \\\n", + " .encode_point_color(\"type\", as_categorical=True, categorical_mapping={\"checkin\": \"red\", \"user\": \"blue\"}) \\\n", + " .encode_edge_color(\"type\", as_categorical=True, categorical_mapping={\"user_to_checkin\": \"red\", \"friendship\": \"blue\"}) \\\n", + " .layout_settings(play=0) \\\n", + " .settings(height=800, url_params={\"pointOpacity\": 0.6, \"edgeOpacity\": 0.01})\n", + "g_hyper.plot()" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "rapids-24.08", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.11.9" + } + }, + "nbformat": 4, + "nbformat_minor": 4 +} diff --git a/docs/source/notebooks/gfql.rst b/docs/source/notebooks/gfql.rst index 99b18c033f..8e5f16378c 100644 --- a/docs/source/notebooks/gfql.rst +++ b/docs/source/notebooks/gfql.rst @@ -12,4 +12,4 @@ GFQL Graph queries GPU Benchmarking <../demos/gfql/benchmark_hops_cpu_gpu.ipynb> GFQL Remote mode <../demos/gfql/gfql_remote.ipynb> Python Remote mode <../demos/gfql/python_remote.ipynb> - # ICIJ FinCEN Files Visualization <../demos/demos_by_user_case/icij_fincen_viz.ipynb> + ICIJ FinCEN Files Visualization <../demos/demos_by_user_case/icij_fincen_viz.ipynb> From c2960d5631765eb8c945cba93efc9f39ac95e2ff Mon Sep 17 00:00:00 2001 From: Manfred Cheung Date: Thu, 18 Dec 2025 14:10:10 -0500 Subject: [PATCH 2/7] update brightkite checkin to add choropleth --- .../social/brightkite_checkin.ipynb | 310 +++++++++++++++--- 1 file changed, 264 insertions(+), 46 deletions(-) diff --git a/demos/demos_by_use_case/social/brightkite_checkin.ipynb b/demos/demos_by_use_case/social/brightkite_checkin.ipynb index 8470f97746..0a1a25c7d6 100644 --- a/demos/demos_by_use_case/social/brightkite_checkin.ipynb +++ b/demos/demos_by_use_case/social/brightkite_checkin.ipynb @@ -27,7 +27,7 @@ "\n", "# To specify Graphistry account & server, use:\n", "# graphistry.register(api=3, protocol=\"https\", server=\"hub.graphistry.com\",\n", - "# username=\"...\", password=\"...\")\n" + "# username=\"...\", password=\"...\")" ] }, { @@ -39,7 +39,7 @@ }, { "cell_type": "code", - "execution_count": 2, + "execution_count": 50, "metadata": {}, "outputs": [ { @@ -70,7 +70,7 @@ "type": "integer" } ], - "ref": "0004d8dd-b4b5-4d4f-894b-122d068d5d3b", + "ref": "8ccb118f-b78d-48ba-a603-bd767459322c", "rows": [ [ "0", @@ -165,7 +165,7 @@ "4 0 5" ] }, - "execution_count": 2, + "execution_count": 50, "metadata": {}, "output_type": "execute_result" } @@ -202,7 +202,7 @@ }, { "cell_type": "code", - "execution_count": 3, + "execution_count": 51, "metadata": {}, "outputs": [ { @@ -248,7 +248,7 @@ "type": "string" } ], - "ref": "417ece4f-1c91-455c-b573-0114cd56f965", + "ref": "fce04ca6-e596-4c7b-bc34-08805db20113", "rows": [ [ "0", @@ -383,7 +383,7 @@ "4 2ef143e12038c870038df53e0478cefc " ] }, - "execution_count": 3, + "execution_count": 51, "metadata": {}, "output_type": "execute_result" } @@ -421,7 +421,7 @@ }, { "cell_type": "code", - "execution_count": 4, + "execution_count": 52, "metadata": {}, "outputs": [ { @@ -466,14 +466,14 @@ }, { "cell_type": "code", - "execution_count": 9, + "execution_count": 53, "metadata": {}, "outputs": [ { "data": { "text/html": [ "\n", - " \n", + " \n", + " \n", + " " + ], + "text/plain": [ + "" + ] + }, + "execution_count": 55, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# Create hypergraph visualization\n", + "g_hyper = graphistry.edges(all_edges, 'source', 'destination').nodes(all_nodes, 'node_id') \\\n", + " .encode_point_color(\"type\", as_categorical=True, categorical_mapping={\"checkin\": \"red\", \"user\": \"blue\"}) \\\n", + " .encode_edge_color(\"type\", as_categorical=True, categorical_mapping={\"user_to_checkin\": \"red\", \"friendship\": \"blue\"}) \\\n", + " .layout_settings(play=0) \\\n", + " .settings(height=800, url_params={\"pointOpacity\": 0.6, \"edgeOpacity\": 0.01})\n", + "g_hyper.plot()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Add Choropleth Map Layer\n", + "\n", + "This visualization adds a geographic choropleth layer using Kepler.gl that color-codes countries by the total number of nodes (users + check-ins) within their borders. The choropleth overlays the hypergraph to provide geographic context for network activity.\n", + "\n", + "**What to explore:**\n", + "- Country-level aggregation: Total node count per country shown via color intensity\n", + "- Color gradient interpretation: Darker (black/dark green) = minimal activity, brighter (vibrant green) = high activity\n", + "- Logarithmic binning: Each color step represents order-of-magnitude increases (1, 10, 100, 1K, 5K, 10K, 15K+)\n", + "- Geographic patterns: Compare regional concentration vs. global distribution\n", + "- Cross-reference: Match choropleth colors to underlying point clusters on the map\n", + "- Network geography: Identify where users and check-ins are concentrated globally" + ] + }, + { + "cell_type": "code", + "execution_count": 56, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Adding country information to 282,400 nodes with coordinates...\n", + "\n", + "Country distribution:\n", + "country\n", + "US 170507\n", + "JP 18165\n", + "GB 17155\n", + "AU 8031\n", + "CA 7654\n", + "DE 7360\n", + "SE 4748\n", + "NL 4442\n", + "IT 3453\n", + "FR 3157\n", + "NO 3144\n", + "ES 2818\n", + "FI 1941\n", + "CN 1866\n", + "BE 1434\n", + "CL 1322\n", + "IN 1313\n", + "BR 1307\n", + "PT 1270\n", + "CH 1207\n", + "Name: count, dtype: int64\n" + ] + } + ], + "source": [ + "# Add country information using reverse_geocoder (fast, offline)\n", + "import reverse_geocoder as rg\n", + "\n", + "# Filter nodes with valid coordinates\n", + "nodes_with_coords = all_nodes[all_nodes['latitude'].notna() & all_nodes['longitude'].notna()].copy()\n", + "\n", + "print(f'Adding country information to {len(nodes_with_coords):,} nodes with coordinates...')\n", + "\n", + "# Prepare coordinates for batch reverse geocoding\n", + "coords = list(zip(nodes_with_coords['latitude'], nodes_with_coords['longitude']))\n", + "\n", + "# Batch reverse geocode (much faster than individual requests)\n", + "results = rg.search(coords)\n", + "\n", + "# Extract country codes\n", + "nodes_with_coords['country'] = [result['cc'] for result in results]\n", + "\n", + "# Merge back to all_nodes\n", + "all_nodes = all_nodes.drop(columns=['country'], errors='ignore')\n", + "all_nodes = all_nodes.merge(\n", + " nodes_with_coords[['node_id', 'country']], \n", + " on='node_id', \n", + " how='left'\n", + ")\n", + "\n", + "print('\\nCountry distribution:')\n", + "print(all_nodes['country'].value_counts().head(20))" + ] + }, + { + "cell_type": "code", + "execution_count": 57, "metadata": {}, "outputs": [ { "data": { "text/html": [ "\n", - " \n", - " \n", - " \n", + " \n", " " ], "text/plain": [ "" ] }, - "execution_count": 111, + "execution_count": 24, "metadata": {}, "output_type": "execute_result" } @@ -673,7 +882,7 @@ " \"edgeOpacity\": 0.3 if len(g_out._edges) > 1500 else 0.9,\n", " \"strongGravity\": True,\n", " \"play\": 2000})\n", - "g_out.plot()" + "g_out.plot(name=\"ICIJ FinCEN Full\")" ] }, { @@ -682,12 +891,12 @@ "source": [ "## Caribbean havens subgraph\n", "\n", - "The ICIJ mentions the importance of the world's top offshore financial havens in the data. Lets find transactions involving Caribbean tax havens using GFQL chain operations." + "The ICIJ mentions the importance of the world's top offshore financial havens in the data. Let's find transactions involving Caribbean tax havens using GFQL chain operations." ] }, { "cell_type": "code", - "execution_count": 113, + "execution_count": 25, "metadata": {}, "outputs": [], "source": [ @@ -714,7 +923,7 @@ }, { "cell_type": "code", - "execution_count": 114, + "execution_count": 26, "metadata": {}, "outputs": [], "source": [ @@ -727,7 +936,7 @@ }, { "cell_type": "code", - "execution_count": 115, + "execution_count": 27, "metadata": {}, "outputs": [ { @@ -827,7 +1036,7 @@ "type": "float" } ], - "ref": "323b075d-0dcb-430e-98d6-01d1a502716d", + "ref": "98ab04a7-55ec-48da-9309-8960fe32bf8a", "rows": [ [ "0", @@ -1118,7 +1327,7 @@ "4 1.0 101000.0 " ] }, - "execution_count": 115, + "execution_count": 27, "metadata": {}, "output_type": "execute_result" } @@ -1130,7 +1339,7 @@ }, { "cell_type": "code", - "execution_count": 116, + "execution_count": 28, "metadata": {}, "outputs": [ { @@ -1149,11 +1358,11 @@ }, { "name": "is_carib_bank", - "rawType": "bool", - "type": "boolean" + "rawType": "object", + "type": "unknown" } ], - "ref": "06682bbf-1f4c-46c2-bc58-1b075a11f6c2", + "ref": "157b20bf-81be-49a4-a35d-8025f71fc5c1", "rows": [ [ "0", @@ -1240,15 +1449,15 @@ "" ], "text/plain": [ - " nodeId is_carib_bank\n", - "0 hsbc-bank True\n", - "1 hsbc True\n", - "2 caledonian-bank-limited True\n", - "3 gonet-bank-and-trust-limited True\n", - "4 dms-bank-trust-ltd True" + " nodeId is_carib_bank\n", + "0 hsbc-bank True\n", + "1 hsbc True\n", + "2 caledonian-bank-limited True\n", + "3 gonet-bank-and-trust-limited True\n", + "4 dms-bank-trust-ltd True" ] }, - "execution_count": 116, + "execution_count": 28, "metadata": {}, "output_type": "execute_result" } @@ -1266,7 +1475,7 @@ }, { "cell_type": "code", - "execution_count": 117, + "execution_count": 29, "metadata": {}, "outputs": [], "source": [ @@ -1279,7 +1488,7 @@ }, { "cell_type": "code", - "execution_count": 118, + "execution_count": 30, "metadata": {}, "outputs": [ { @@ -1379,7 +1588,7 @@ "type": "float" } ], - "ref": "564a8112-a18c-4102-b227-430909a639ad", + "ref": "47fce7ec-b59f-4cf2-8ce1-510bee56b55b", "rows": [ [ "0", @@ -1663,7 +1872,7 @@ "4 2.0 200000.0 " ] }, - "execution_count": 118, + "execution_count": 30, "metadata": {}, "output_type": "execute_result" } @@ -1675,7 +1884,7 @@ }, { "cell_type": "code", - "execution_count": 119, + "execution_count": 31, "metadata": {}, "outputs": [ { @@ -1694,11 +1903,11 @@ }, { "name": "is_carib_bank", - "rawType": "bool", - "type": "boolean" + "rawType": "object", + "type": "unknown" } ], - "ref": "da9991cc-04a3-469f-afcd-b2df1bee9b7c", + "ref": "83e79bc4-93a7-40e2-a489-ca39b316c75c", "rows": [ [ "0", @@ -1785,15 +1994,15 @@ "" ], "text/plain": [ - " nodeId is_carib_bank\n", - "0 hsbc-hong-kong-hkg False\n", - "1 credit-suisse-ag False\n", - "2 bsi-sa False\n", - "3 abn-amro-bank-nv False\n", - "4 deutsche-bank-ag False" + " nodeId is_carib_bank\n", + "0 hsbc-hong-kong-hkg False\n", + "1 credit-suisse-ag False\n", + "2 bsi-sa False\n", + "3 abn-amro-bank-nv False\n", + "4 deutsche-bank-ag False" ] }, - "execution_count": 119, + "execution_count": 31, "metadata": {}, "output_type": "execute_result" } @@ -1813,33 +2022,28 @@ }, { "cell_type": "code", - "execution_count": 120, + "execution_count": null, "metadata": {}, "outputs": [ { "data": { "text/html": [ "\n", - " \n", - " \n", - " \n", + " \n", " " ], "text/plain": [ "" ] }, - "execution_count": 120, + "execution_count": 32, "metadata": {}, "output_type": "execute_result" } @@ -1868,7 +2072,7 @@ " \"play\": 2000})\n", ")\n", "\n", - "g_carib_styled.plot()" + "g_carib_styled.plot(name=\"ICIJ FinCEN Caribbean\")" ] }, { @@ -1877,38 +2081,33 @@ "source": [ "## Latvia-Russia subgraph\n", "\n", - "Find all data following a specific transaction pattern: Latvia to Russia transactions in a specific amount range." + "Find all data following a specific transaction pattern: Latvia to Russia transactions." ] }, { "cell_type": "code", - "execution_count": 121, + "execution_count": 40, "metadata": {}, "outputs": [ { "data": { "text/html": [ "\n", - " \n", - " \n", - " \n", + " \n", " " ], "text/plain": [ "" ] }, - "execution_count": 121, + "execution_count": 40, "metadata": {}, "output_type": "execute_result" } @@ -1918,7 +2117,7 @@ "\n", "chain_operations = [\n", " e_forward(hops=1, edge_match={\"originator_bank_country\": \"Latvia\", \"beneficiary_bank_country\": \"Russia\"}),\n", - " n({\"nodeId\": contains(pat=\"\")}, name=\"is_rus_beneficiary\"),\n", + " n(name=\"is_rus_beneficiary\"),\n", "]\n", "g_lva_rus = g.gfql(chain_operations)\n", "\n", @@ -1936,12 +2135,12 @@ " \"edgeOpacity\": 0.3 if len(g_lva_rus._edges) > 1500 else 0.9,\n", " \"strongGravity\": True,\n", " \"play\": 2000})\n", - "g_lva_rus.plot()" + "g_lva_rus.plot(name=\"ICIJ FinCEN Lva-Rus\")" ] }, { "cell_type": "code", - "execution_count": 122, + "execution_count": 34, "metadata": {}, "outputs": [ { @@ -1960,11 +2159,11 @@ }, { "name": "is_rus_beneficiary", - "rawType": "bool", - "type": "boolean" + "rawType": "object", + "type": "unknown" } ], - "ref": "91b5eebd-38eb-4840-9376-a30c770e7ff0", + "ref": "1f2c7c78-4197-4380-a45b-65083c8ab2a4", "rows": [ [ "0", @@ -2051,15 +2250,15 @@ "" ], "text/plain": [ - " nodeId is_rus_beneficiary\n", - "0 latvian-trade-commercial-bank False\n", - "1 ltb-bank-riga False\n", - "2 norvik-banka-jsc False\n", - "3 jsc-norvik-banka False\n", - "4 rietumu-banka-jsc False" + " nodeId is_rus_beneficiary\n", + "0 latvian-trade-commercial-bank False\n", + "1 ltb-bank-riga False\n", + "2 norvik-banka-jsc False\n", + "3 jsc-norvik-banka False\n", + "4 rietumu-banka-jsc False" ] }, - "execution_count": 122, + "execution_count": 34, "metadata": {}, "output_type": "execute_result" } @@ -2070,7 +2269,7 @@ }, { "cell_type": "code", - "execution_count": 123, + "execution_count": 35, "metadata": {}, "outputs": [ { @@ -2163,7 +2362,7 @@ "type": "float" } ], - "ref": "58886227-2ab8-48c2-b567-51fa8269bc55", + "ref": "c62d5465-337c-4c4e-91c9-e250e9e5410c", "rows": [ [ "0", @@ -2447,7 +2646,7 @@ "4 1.0 790753.42 " ] }, - "execution_count": 123, + "execution_count": 35, "metadata": {}, "output_type": "execute_result" } @@ -2472,7 +2671,7 @@ }, { "cell_type": "code", - "execution_count": 124, + "execution_count": 36, "metadata": {}, "outputs": [], "source": [ @@ -2491,7 +2690,7 @@ }, { "cell_type": "code", - "execution_count": 125, + "execution_count": 37, "metadata": {}, "outputs": [ { @@ -2510,11 +2709,11 @@ }, { "name": "is_soyuz", - "rawType": "bool", - "type": "boolean" + "rawType": "object", + "type": "unknown" } ], - "ref": "7846a1c3-19cc-4969-bfeb-6d49c88517d2", + "ref": "21e213e6-fb69-4c24-96df-d9d7fa87e121", "rows": [ [ "0", @@ -2571,12 +2770,12 @@ "" ], "text/plain": [ - " nodeId is_soyuz\n", - "0 as-expobank False\n", - "1 bank-soyuz-moscow-russia-rus True" + " nodeId is_soyuz\n", + "0 as-expobank False\n", + "1 bank-soyuz-moscow-russia-rus True" ] }, - "execution_count": 125, + "execution_count": 37, "metadata": {}, "output_type": "execute_result" } @@ -2587,7 +2786,7 @@ }, { "cell_type": "code", - "execution_count": 126, + "execution_count": 38, "metadata": {}, "outputs": [ { @@ -2680,7 +2879,7 @@ "type": "float" } ], - "ref": "3a269bb6-f311-4b19-bd36-5676983283ff", + "ref": "caf2dc46-ab93-4f7b-b416-1c2458aad8d4", "rows": [ [ "0", @@ -2785,7 +2984,7 @@ "0 RUS 1.0 15900000.0 " ] }, - "execution_count": 126, + "execution_count": 38, "metadata": {}, "output_type": "execute_result" } @@ -2803,33 +3002,28 @@ }, { "cell_type": "code", - "execution_count": 128, + "execution_count": 39, "metadata": {}, "outputs": [ { "data": { "text/html": [ "\n", - " \n", - " \n", - " \n", + " \n", " " ], "text/plain": [ "" ] }, - "execution_count": 128, + "execution_count": 39, "metadata": {}, "output_type": "execute_result" } @@ -2850,7 +3044,7 @@ " \"play\": 2000})\n", ")\n", "\n", - "g_od_styled.plot()" + "g_od_styled.plot(name=\"ICIJ FinCEN OD\")" ] }, { @@ -2863,7 +3057,7 @@ "\n", "- **Node filtering**: `n()` with attribute matching and predicates\n", "- **Edge traversal**: `e_forward()` with hop counts and edge matching\n", - "- **Chaining operations**: `graphistry.Chain()` to combine multiple operations\n", + "- **Chaining operations**: `g.gfql()` to combine multiple operations\n", "- **Predicates**:\n", " - `is_in()` for matching multiple values\n", " - `contains()` for substring matching\n", From c4674b57933e094dd2ed159d5ede4bfd0f652e90 Mon Sep 17 00:00:00 2001 From: Manfred Cheung Date: Mon, 6 Apr 2026 20:06:35 -0400 Subject: [PATCH 5/7] update and correct biogrid demo --- demos/demos_by_use_case/bio/BiogridDemo.ipynb | 609 ++++++++++++++---- 1 file changed, 468 insertions(+), 141 deletions(-) diff --git a/demos/demos_by_use_case/bio/BiogridDemo.ipynb b/demos/demos_by_use_case/bio/BiogridDemo.ipynb index 98e57ae8e9..afbab486b5 100644 --- a/demos/demos_by_use_case/bio/BiogridDemo.ipynb +++ b/demos/demos_by_use_case/bio/BiogridDemo.ipynb @@ -6,27 +6,29 @@ "source": [ "# PyGraphistry Tutorial: Visualize Protein Interactions From BioGrid\n", "\n", - "That is over 600.000 interactions across 50'000 proteins!\n", + "That is over 600 000 interactions across 50 000 proteins!\n", "\n", "##### Notes\n", "\n", - "This notebook automatically downloads about 200 MB of [BioGrid](http://thebiogrid.org) data. If you are going to run this notebook more than once, we recommend manually dowloading and saving the data to disk. To do so, unzip the two files and place their content in `pygraphistry/demos/data`.\n", - "- Protein Interactions: [BIOGRID-ALL-3.3.123.tab2.zip](http://thebiogrid.org/downloads/archives/Release%20Archive/BIOGRID-3.3.123/BIOGRID-ALL-3.3.123.tab2.zip)\n", - "- Protein Identifiers: [BIOGRID-IDENTIFIERS-3.3.123.tab.zip](http://thebiogrid.org/downloads/archives/Release%20Archive/BIOGRID-3.3.123/BIOGRID-IDENTIFIERS-3.3.123.tab.zip)\n" + "This notebook automatically downloads about 200 MB of [BioGrid](http://thebiogrid.org) data.\n", + "- Protein Interactions: [BIOGRID-ALL-5.0.252.tab2.zip](http://thebiogrid.org/downloads/archives/Release%20Archive/BIOGRID-5.0.252/BIOGRID-ALL-5.0.252.tab2.zip)\n", + "- Protein Identifiers: [BIOGRID-IDENTIFIERS-5.0.252.tab.zip](http://thebiogrid.org/downloads/archives/Release%20Archive/BIOGRID-5.0.252/BIOGRID-IDENTIFIERS-5.0.252.tab.zip)\n" ] }, { "cell_type": "code", - "execution_count": 1, + "execution_count": null, "metadata": {}, "outputs": [], "source": [ + "import os\n", + "import requests\n", "import pandas\n", "import graphistry\n", "\n", "# To specify Graphistry account & server, use:\n", "# graphistry.register(api=3, username='...', password='...', protocol='https', server='hub.graphistry.com')\n", - "# For more options: https://pygraphistry.readthedocs.io/en/latest/server/register.html\n" + "# For more options: https://pygraphistry.readthedocs.io/en/latest/server/register.html" ] }, { @@ -39,7 +41,7 @@ }, { "cell_type": "code", - "execution_count": 2, + "execution_count": 16, "metadata": { "scrolled": false }, @@ -48,26 +50,107 @@ "name": "stderr", "output_type": "stream", "text": [ - "/usr/local/lib/python2.7/site-packages/IPython/core/interactiveshell.py:2717: DtypeWarning: Columns (19,20) have mixed types. Specify dtype option on import or set low_memory=False.\n", - " interactivity=interactivity, compiler=compiler, result=result)\n" + "/tmp/ipykernel_887036/1971870635.py:6: DtypeWarning: Columns (9,10,19,20) have mixed types. Specify dtype option on import or set low_memory=False.\n", + " rawdata = pandas.read_table(local_path, na_values=['-'], engine='c', compression='zip')\n" ] }, { "data": { + "application/vnd.microsoft.datawrangler.viewer.v0+json": { + "columns": [ + { + "name": "index", + "rawType": "int64", + "type": "integer" + }, + { + "name": "BioGRID ID Interactor A", + "rawType": "int64", + "type": "integer" + }, + { + "name": "BioGRID ID Interactor B", + "rawType": "int64", + "type": "integer" + }, + { + "name": "Official Symbol Interactor A", + "rawType": "object", + "type": "string" + }, + { + "name": "Official Symbol Interactor B", + "rawType": "object", + "type": "string" + }, + { + "name": "Pubmed ID", + "rawType": "int64", + "type": "integer" + }, + { + "name": "Author", + "rawType": "object", + "type": "string" + }, + { + "name": "Throughput", + "rawType": "object", + "type": "string" + } + ], + "ref": "b2e86da8-9547-4b2a-8d62-e592b31af662", + "rows": [ + [ + "0", + "112315", + "108607", + "MAP2K4", + "FLNC", + "9006895", + "Marti A (1997)", + "Low Throughput" + ], + [ + "1", + "124185", + "106603", + "MYPN", + "ACTN2", + "11309420", + "Bang ML (2001)", + "Low Throughput" + ], + [ + "2", + "106605", + "108625", + "ACVR1", + "FNTA", + "8599089", + "Wang T (1996)", + "Low Throughput" + ] + ], + "shape": { + "columns": 7, + "rows": 3 + } + }, "text/html": [ "
\n", - "\n", "\n", " \n", @@ -134,17 +217,18 @@ "2 Wang T (1996) Low Throughput " ] }, - "execution_count": 2, + "execution_count": 16, "metadata": {}, "output_type": "execute_result" } ], "source": [ - "url1 = 'https://s3-us-west-1.amazonaws.com/graphistry.demo.data/BIOGRID-ALL-3.3.123.tab2.txt.gz'\n", - "rawdata = pandas.read_table(url1, na_values=['-'], engine='c', compression='gzip')\n", + "local_path = './data/BIOGRID-ALL-5.0.252.tab2.zip'\n", + "if not os.path.exists(local_path):\n", + " os.makedirs(os.path.dirname(local_path), exist_ok=True)\n", + " open(local_path, 'wb').write(requests.get('https://downloads.thebiogrid.org/Download/BioGRID/Release-Archive/BIOGRID-5.0.252/BIOGRID-ALL-5.0.252.tab2.zip').content)\n", "\n", - "# If using local data, comment the two lines above and uncomment the line below\n", - "# pandas.read_table('./data/BIOGRID-ALL-3.3.123.tab2.txt', na_values=['-'], engine='c')\n", + "rawdata = pandas.read_table(local_path, na_values=['-'], engine='c', compression='zip')\n", "\n", "cols = ['BioGRID ID Interactor A', 'BioGRID ID Interactor B', 'Official Symbol Interactor A', \n", " 'Official Symbol Interactor B', 'Pubmed ID', 'Author', 'Throughput']\n", @@ -162,23 +246,25 @@ }, { "cell_type": "code", - "execution_count": 3, + "execution_count": 17, "metadata": {}, "outputs": [ { "data": { "text/html": [ "\n", - " \n", " \n", " \n", " " ], @@ -186,14 +272,15 @@ "" ] }, - "execution_count": 3, + "execution_count": 17, "metadata": {}, "output_type": "execute_result" } ], "source": [ "g = graphistry.bind(source=\"BioGRID ID Interactor A\", destination=\"BioGRID ID Interactor B\")\n", - "g.plot(interactions.sample(10000))" + "result = g.plot(interactions.sample(10000), render='ipython', name=\"Biogrid Min\")\n", + "result" ] }, { @@ -206,27 +293,68 @@ }, { "cell_type": "code", - "execution_count": 4, + "execution_count": 18, "metadata": { "scrolled": true }, "outputs": [ { "data": { + "application/vnd.microsoft.datawrangler.viewer.v0+json": { + "columns": [ + { + "name": "index", + "rawType": "int64", + "type": "integer" + }, + { + "name": "BIOGRID_ID", + "rawType": "int64", + "type": "integer" + }, + { + "name": "ORGANISM", + "rawType": "object", + "type": "string" + } + ], + "ref": "f9491f39-29cf-4d7d-a0c7-867582159c64", + "rows": [ + [ + "0", + "1", + "Arabidopsis thaliana" + ], + [ + "6", + "2", + "Arabidopsis thaliana" + ], + [ + "20", + "3", + "Arabidopsis thaliana" + ] + ], + "shape": { + "columns": 2, + "rows": 3 + } + }, "text/html": [ "
\n", - "\n", "
\n", " \n", @@ -243,12 +371,12 @@ " \n", " \n", " \n", - " \n", + " \n", " \n", " \n", " \n", " \n", - " \n", + " \n", " \n", " \n", " \n", @@ -259,59 +387,92 @@ "text/plain": [ " BIOGRID_ID ORGANISM\n", "0 1 Arabidopsis thaliana\n", - "7 2 Arabidopsis thaliana\n", - "22 3 Arabidopsis thaliana" + "6 2 Arabidopsis thaliana\n", + "20 3 Arabidopsis thaliana" ] }, - "execution_count": 4, + "execution_count": 18, "metadata": {}, "output_type": "execute_result" } ], "source": [ - "# This downloads 170 MB, it might take some time.\n", - "url2 = 'https://s3-us-west-1.amazonaws.com/graphistry.demo.data/BIOGRID-IDENTIFIERS-3.3.123.tab.txt.gz'\n", - "raw_proteins = pandas.read_table(url2, na_values=['-'], engine='c', compression='gzip')\n", - "\n", - "# If using local data, comment the two lines above and uncomment the line below\n", - "# raw_proteins = pandas.read_table('./data/BIOGRID-IDENTIFIERS-3.3.123.tab.txt', na_values=['-'], engine='c')\n", + "local_path2 = './data/BIOGRID-IDENTIFIERS-5.0.252.tab.zip'\n", + "if not os.path.exists(local_path2):\n", + " os.makedirs(os.path.dirname(local_path2), exist_ok=True)\n", + " open(local_path2, 'wb').write(requests.get('https://downloads.thebiogrid.org/Download/BioGRID/Release-Archive/BIOGRID-5.0.252/BIOGRID-IDENTIFIERS-5.0.252.tab.zip').content)\n", "\n", + "raw_proteins = pandas.read_table(local_path2, na_values=['-'], engine='c', compression='zip', skiprows=28)\n", "\n", "protein_ids = raw_proteins[['BIOGRID_ID', 'ORGANISM_OFFICIAL_NAME']].drop_duplicates() \\\n", " .rename(columns={'ORGANISM_OFFICIAL_NAME': 'ORGANISM'})\n", "protein_ids[:3]" ] }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "We extract the proteins referenced as either sources or targets of interactions." - ] - }, { "cell_type": "code", - "execution_count": 5, + "execution_count": 19, "metadata": { "scrolled": true }, "outputs": [ { "data": { + "application/vnd.microsoft.datawrangler.viewer.v0+json": { + "columns": [ + { + "name": "index", + "rawType": "int64", + "type": "integer" + }, + { + "name": "BIOGRID_ID", + "rawType": "int64", + "type": "integer" + }, + { + "name": "SYMBOL", + "rawType": "object", + "type": "string" + } + ], + "ref": "80b3371f-aa92-44c9-af6b-1f2a68588df2", + "rows": [ + [ + "0", + "112315", + "MAP2K4" + ], + [ + "1", + "124185", + "MYPN" + ], + [ + "2", + "106605", + "ACVR1" + ] + ], + "shape": { + "columns": 2, + "rows": 3 + } + }, "text/html": [ "
\n", - "\n", "
Arabidopsis thaliana
762Arabidopsis thaliana
22203Arabidopsis thaliana
\n", " \n", @@ -348,7 +509,7 @@ "2 106605 ACVR1" ] }, - "execution_count": 5, + "execution_count": 19, "metadata": {}, "output_type": "execute_result" } @@ -366,34 +527,76 @@ "all_proteins[:3]" ] }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "We join on the indentification DB to get the organism in which each protein belongs." - ] - }, { "cell_type": "code", - "execution_count": 6, + "execution_count": 20, "metadata": {}, "outputs": [ { "data": { + "application/vnd.microsoft.datawrangler.viewer.v0+json": { + "columns": [ + { + "name": "index", + "rawType": "int64", + "type": "integer" + }, + { + "name": "BIOGRID_ID", + "rawType": "int64", + "type": "integer" + }, + { + "name": "SYMBOL", + "rawType": "object", + "type": "string" + }, + { + "name": "ORGANISM", + "rawType": "object", + "type": "string" + } + ], + "ref": "d7938a83-d8bc-4064-a0bc-14b07f744f7b", + "rows": [ + [ + "0", + "112315", + "MAP2K4", + "Homo sapiens" + ], + [ + "1", + "124185", + "MYPN", + "Homo sapiens" + ], + [ + "2", + "106605", + "ACVR1", + "Homo sapiens" + ] + ], + "shape": { + "columns": 3, + "rows": 3 + } + }, "text/html": [ "
\n", - "\n", "
\n", " \n", @@ -434,7 +637,7 @@ "2 106605 ACVR1 Homo sapiens" ] }, - "execution_count": 6, + "execution_count": 20, "metadata": {}, "output_type": "execute_result" } @@ -453,42 +656,99 @@ }, { "cell_type": "code", - "execution_count": 7, + "execution_count": 21, "metadata": {}, "outputs": [], "source": [ "colors = protein_labels.ORGANISM.unique().tolist()\n", - "protein_labels['Color'] = protein_labels.ORGANISM.map(lambda x: colors.index(x))" + "protein_labels['Color'] = protein_labels.ORGANISM.map(lambda x: colors.index(x)).astype('int32')" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ - "For convenience, let's add links to PubMed and RCSB." + "For convenience, let's add links to PubMed and Biogrid." ] }, { "cell_type": "code", - "execution_count": 8, + "execution_count": 22, "metadata": {}, "outputs": [ { "data": { + "application/vnd.microsoft.datawrangler.viewer.v0+json": { + "columns": [ + { + "name": "index", + "rawType": "int64", + "type": "integer" + }, + { + "name": "BIOGRID_ID", + "rawType": "int64", + "type": "integer" + }, + { + "name": "SYMBOL", + "rawType": "object", + "type": "string" + }, + { + "name": "ORGANISM", + "rawType": "object", + "type": "string" + }, + { + "name": "Color", + "rawType": "int32", + "type": "integer" + } + ], + "ref": "b6bad508-716f-452f-832b-60e22d909a95", + "rows": [ + [ + "0", + "112315", + "MAP2K4", + "Homo sapiens", + "0" + ], + [ + "1", + "124185", + "MYPN", + "Homo sapiens", + "0" + ], + [ + "2", + "106605", + "ACVR1", + "Homo sapiens", + "0" + ] + ], + "shape": { + "columns": 4, + "rows": 3 + } + }, "text/html": [ "
\n", - "\n", "
\n", " \n", @@ -504,21 +764,21 @@ " \n", " \n", " \n", - " \n", + " \n", " \n", " \n", " \n", " \n", " \n", " \n", - " \n", + " \n", " \n", " \n", " \n", " \n", " \n", " \n", - " \n", + " \n", " \n", " \n", " \n", @@ -528,9 +788,9 @@ ], "text/plain": [ " BIOGRID_ID SYMBOL \\\n", - "0 112315 %s' % (url, id.upper())\n", - " else:\n", - " return 'n/a'\n", + "def makeBiogridLink(row):\n", + " if pandas.notna(row.get('BIOGRID_ID')):\n", + " url = f'https://thebiogrid.org/{row[\"BIOGRID_ID\"]}'\n", + " return f'{row[\"SYMBOL\"]}'\n", + " return 'n/a'\n", " \n", - "protein_labels.SYMBOL = protein_labels.SYMBOL.map(makeRcsbLink)\n", + "protein_labels.SYMBOL = protein_labels.apply(makeBiogridLink, axis=1)\n", "protein_labels[:3]" ] }, { "cell_type": "code", - "execution_count": 9, + "execution_count": 23, "metadata": { "scrolled": true }, "outputs": [ { "data": { + "application/vnd.microsoft.datawrangler.viewer.v0+json": { + "columns": [ + { + "name": "index", + "rawType": "int64", + "type": "integer" + }, + { + "name": "BioGRID ID Interactor A", + "rawType": "int64", + "type": "integer" + }, + { + "name": "BioGRID ID Interactor B", + "rawType": "int64", + "type": "integer" + }, + { + "name": "Official Symbol Interactor A", + "rawType": "object", + "type": "string" + }, + { + "name": "Official Symbol Interactor B", + "rawType": "object", + "type": "string" + }, + { + "name": "Pubmed ID", + "rawType": "object", + "type": "string" + }, + { + "name": "Author", + "rawType": "object", + "type": "string" + }, + { + "name": "Throughput", + "rawType": "object", + "type": "string" + } + ], + "ref": "15384706-11a1-4564-91be-8f3149642949", + "rows": [ + [ + "0", + "112315", + "108607", + "MAP2K4", + "FLNC", + "9006895", + "Marti A (1997)", + "Low Throughput" + ], + [ + "1", + "124185", + "106603", + "MYPN", + "ACTN2", + "11309420", + "Bang ML (2001)", + "Low Throughput" + ], + [ + "2", + "106605", + "108625", + "ACVR1", + "FNTA", + "8599089", + "Wang T (1996)", + "Low Throughput" + ] + ], + "shape": { + "columns": 7, + "rows": 3 + } + }, "text/html": [ "
\n", - "\n", "
0112315<a target=\"_blank\" href=\"http://www.rcsb.org/p...<a target=\"_blank\" href=\"https://thebiogrid.or...Homo sapiens0
1124185<a target=\"_blank\" href=\"http://www.rcsb.org/p...<a target=\"_blank\" href=\"https://thebiogrid.or...Homo sapiens0
2106605<a target=\"_blank\" href=\"http://www.rcsb.org/p...<a target=\"_blank\" href=\"https://thebiogrid.or...Homo sapiens0
\n", " \n", @@ -649,7 +989,7 @@ "2 Low Throughput " ] }, - "execution_count": 9, + "execution_count": 23, "metadata": {}, "output_type": "execute_result" } @@ -673,32 +1013,27 @@ }, { "cell_type": "code", - "execution_count": 10, + "execution_count": 24, "metadata": { "scrolled": false }, "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Uploading 7139 kB. This may take a while...\n" - ] - }, { "data": { "text/html": [ "\n", - " \n", " \n", " \n", " " ], @@ -706,30 +1041,22 @@ "" ] }, - "execution_count": 10, + "execution_count": 24, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# This will upload ~10MB of data, be patient!\n", - "g2 = g.bind(node='BIOGRID_ID', edge_title='Author', point_title='SYMBOL', point_color='Color')\n", - "g2.plot(interactions, protein_labels)" + "g2 = g.bind(node='BIOGRID_ID', source=\"BioGRID ID Interactor A\", destination=\"BioGRID ID Interactor B\", \n", + " edge_title='Author', point_title='SYMBOL', point_color='Color')\n", + "g2.plot(interactions.drop(columns=['Pubmed ID']), protein_labels, name=\"Biogrid Labeled\")" ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "collapsed": true - }, - "outputs": [], - "source": [] } ], "metadata": { "kernelspec": { - "display_name": "Python 3", + "display_name": "rapids-26.02", "language": "python", "name": "python3" }, @@ -743,7 +1070,7 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.7.11" + "version": "3.11.14" } }, "nbformat": 4, From e791834c83dedda7fc1a8ccf63ccea9555b8dba9 Mon Sep 17 00:00:00 2001 From: Manfred Cheung Date: Mon, 6 Apr 2026 20:23:29 -0400 Subject: [PATCH 6/7] improve brightkite demo --- .../social/brightkite_checkin.ipynb | 282 ++++++++++++------ 1 file changed, 194 insertions(+), 88 deletions(-) diff --git a/demos/demos_by_use_case/social/brightkite_checkin.ipynb b/demos/demos_by_use_case/social/brightkite_checkin.ipynb index 0a1a25c7d6..10440e77f5 100644 --- a/demos/demos_by_use_case/social/brightkite_checkin.ipynb +++ b/demos/demos_by_use_case/social/brightkite_checkin.ipynb @@ -39,7 +39,7 @@ }, { "cell_type": "code", - "execution_count": 50, + "execution_count": 2, "metadata": {}, "outputs": [ { @@ -70,7 +70,7 @@ "type": "integer" } ], - "ref": "8ccb118f-b78d-48ba-a603-bd767459322c", + "ref": "1a98570c-5c5f-4e82-ada8-00a4f6b03c3f", "rows": [ [ "0", @@ -165,7 +165,7 @@ "4 0 5" ] }, - "execution_count": 50, + "execution_count": 2, "metadata": {}, "output_type": "execute_result" } @@ -202,7 +202,7 @@ }, { "cell_type": "code", - "execution_count": 51, + "execution_count": 3, "metadata": {}, "outputs": [ { @@ -248,7 +248,7 @@ "type": "string" } ], - "ref": "fce04ca6-e596-4c7b-bc34-08805db20113", + "ref": "b10f3f43-b7a0-44a1-9910-421dba902b50", "rows": [ [ "0", @@ -383,7 +383,7 @@ "4 2ef143e12038c870038df53e0478cefc " ] }, - "execution_count": 51, + "execution_count": 3, "metadata": {}, "output_type": "execute_result" } @@ -421,7 +421,7 @@ }, { "cell_type": "code", - "execution_count": 52, + "execution_count": 4, "metadata": {}, "outputs": [ { @@ -433,6 +433,125 @@ "Users with valid check-ins: 50,686\n", "Users in filtered network: 50,111\n" ] + }, + { + "data": { + "application/vnd.microsoft.datawrangler.viewer.v0+json": { + "columns": [ + { + "name": "index", + "rawType": "int64", + "type": "integer" + }, + { + "name": "user1", + "rawType": "int64", + "type": "integer" + }, + { + "name": "user2", + "rawType": "int64", + "type": "integer" + } + ], + "ref": "0f09b4e8-d1c2-42d3-92f8-0185023fc0c7", + "rows": [ + [ + "0", + "0", + "1" + ], + [ + "1", + "0", + "2" + ], + [ + "2", + "0", + "3" + ], + [ + "3", + "0", + "4" + ], + [ + "4", + "0", + "5" + ] + ], + "shape": { + "columns": 2, + "rows": 5 + } + }, + "text/html": [ + "
\n", + "\n", + "
\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
user1user2
001
102
203
304
405
\n", + "
" + ], + "text/plain": [ + " user1 user2\n", + "0 0 1\n", + "1 0 2\n", + "2 0 3\n", + "3 0 4\n", + "4 0 5" + ] + }, + "execution_count": 4, + "metadata": {}, + "output_type": "execute_result" } ], "source": [ @@ -446,7 +565,8 @@ "print(f'Filtered edges: {len(edges_df):,} -> {len(edges_df_filtered):,}')\n", "print(f'Users in network: {pd.concat([edges_df[\"user1\"], edges_df[\"user2\"]]).nunique():,}')\n", "print(f'Users with valid check-ins: {len(valid_users):,}')\n", - "print(f'Users in filtered network: {pd.concat([edges_df_filtered[\"user1\"], edges_df_filtered[\"user2\"]]).nunique():,}')" + "print(f'Users in filtered network: {pd.concat([edges_df_filtered[\"user1\"], edges_df_filtered[\"user2\"]]).nunique():,}')\n", + "edges_df_filtered.head()" ] }, { @@ -466,33 +586,28 @@ }, { "cell_type": "code", - "execution_count": 53, + "execution_count": 10, "metadata": {}, "outputs": [ { "data": { "text/html": [ "\n", - " \n", - " \n", - " \n", + " \n", " " ], "text/plain": [ "" ] }, - "execution_count": 53, + "execution_count": 10, "metadata": {}, "output_type": "execute_result" } @@ -504,7 +619,7 @@ "g = graphistry.edges(edges_df_filtered, 'user1', 'user2').nodes(checkins_df.groupby('user').first().reset_index(), 'user') \\\n", " .layout_settings(play=0) \\\n", " .settings(height=800, url_params={\"pointOpacity\": 0.6, \"edgeOpacity\": 0.01})\n", - "g.plot()" + "g.plot(name=\"Brightkite Basic\")" ] }, { @@ -525,32 +640,32 @@ }, { "cell_type": "code", - "execution_count": 54, + "execution_count": 6, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ - "Max check-ins per user: 6\n", + "Max check-ins per user: 10\n", "Original check-ins: 4,491,144\n", - "Sampled check-ins: 231,714\n", + "Sampled check-ins: 343,587\n", "Users with check-ins: 50,686\n", "User nodes: 50,686\n", - "Check-in nodes: 231,714\n", + "Check-in nodes: 343,587\n", "Friendship edges: 388,180\n", - "User->check-in edges: 231,714\n", - "Total nodes: 282,400\n", - "Total edges: 619,894\n" + "User->check-in edges: 343,587\n", + "Total nodes: 394,273\n", + "Total edges: 731,767\n" ] } ], "source": [ "# Sample check-ins using per-user cap for fair representation\n", - "# Users with ≤6 check-ins: keep all\n", - "# Users with >6 check-ins: randomly sample 6\n", + "# Users with ≤10 check-ins: keep all\n", + "# Users with >10 check-ins: randomly sample 10\n", "\n", - "max_per_user = 6 # Maximum check-ins per user\n", + "max_per_user = 10 # Maximum check-ins per user\n", "\n", "checkins_sampled = checkins_df.groupby('user', group_keys=False)[checkins_df.columns].apply(\n", " lambda x: x if len(x) <= max_per_user else x.sample(n=max_per_user, random_state=42)\n", @@ -612,33 +727,28 @@ }, { "cell_type": "code", - "execution_count": 55, + "execution_count": 11, "metadata": {}, "outputs": [ { "data": { "text/html": [ "\n", - " \n", - " \n", - " \n", + " \n", " " ], "text/plain": [ "" ] }, - "execution_count": 55, + "execution_count": 11, "metadata": {}, "output_type": "execute_result" } @@ -650,7 +760,7 @@ " .encode_edge_color(\"type\", as_categorical=True, categorical_mapping={\"user_to_checkin\": \"red\", \"friendship\": \"blue\"}) \\\n", " .layout_settings(play=0) \\\n", " .settings(height=800, url_params={\"pointOpacity\": 0.6, \"edgeOpacity\": 0.01})\n", - "g_hyper.plot()" + "g_hyper.plot(name=\"Brightkite Map\")" ] }, { @@ -672,37 +782,38 @@ }, { "cell_type": "code", - "execution_count": 56, + "execution_count": 8, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ - "Adding country information to 282,400 nodes with coordinates...\n", + "Adding country information to 394,273 nodes with coordinates...\n", + "Loading formatted geocoded file...\n", "\n", "Country distribution:\n", "country\n", - "US 170507\n", - "JP 18165\n", - "GB 17155\n", - "AU 8031\n", - "CA 7654\n", - "DE 7360\n", - "SE 4748\n", - "NL 4442\n", - "IT 3453\n", - "FR 3157\n", - "NO 3144\n", - "ES 2818\n", - "FI 1941\n", - "CN 1866\n", - "BE 1434\n", - "CL 1322\n", - "IN 1313\n", - "BR 1307\n", - "PT 1270\n", - "CH 1207\n", + "US 238806\n", + "JP 26953\n", + "GB 23758\n", + "AU 11320\n", + "CA 10310\n", + "DE 10213\n", + "SE 6749\n", + "NL 6291\n", + "IT 4759\n", + "NO 4329\n", + "FR 4309\n", + "ES 3890\n", + "FI 2651\n", + "CN 2477\n", + "BE 1900\n", + "CL 1855\n", + "IN 1732\n", + "BR 1685\n", + "CH 1682\n", + "PT 1588\n", "Name: count, dtype: int64\n" ] } @@ -739,33 +850,28 @@ }, { "cell_type": "code", - "execution_count": 57, + "execution_count": 12, "metadata": {}, "outputs": [ { "data": { "text/html": [ "\n", - " \n", - " \n", - " \n", + " \n", " " ], "text/plain": [ "" ] }, - "execution_count": 57, + "execution_count": 12, "metadata": {}, "output_type": "execute_result" } @@ -877,7 +983,7 @@ " .encode_kepler(kepler_ps_encoding) \\\n", " .layout_settings(play=0) \\\n", " .settings(height=800, url_params={\"pointOpacity\": 0.6, \"edgeOpacity\": 0.01})\n", - "g_hyper.plot()" + "g_hyper.plot(name=\"Brightkite Choropleth\")" ] } ], From c4c31ac5b62fca26fac3cfda224371ea4284cbf9 Mon Sep 17 00:00:00 2001 From: Manfred Cheung Date: Tue, 7 Apr 2026 13:46:42 -0400 Subject: [PATCH 7/7] add brighkite to notebooks list --- docs/source/notebooks/visualization.rst | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/docs/source/notebooks/visualization.rst b/docs/source/notebooks/visualization.rst index 44e9cc48f5..08081d0940 100644 --- a/docs/source/notebooks/visualization.rst +++ b/docs/source/notebooks/visualization.rst @@ -23,7 +23,8 @@ Geographic (Kepler.gl) :caption: Geographic visualization with Kepler.gl :titlesonly: - Geospatial Network Visualization <../demos/more_examples/graphistry_features/layout_map.ipynb> + Company Networks on a Map <../demos/more_examples/graphistry_features/layout_map.ipynb> + Brightkite Check-ins <../demos/demos_by_use_case/social/brightkite_checkin.ipynb> Layout -------