# Load data manipulation libraries
import pandas as pd
import numpy as np
import statsmodels.api as sm
from scipy.stats import gaussian_kde
# Load plotly visualization modules
import plotly.express as px
import plotly.graph_objs as go
# Load bokeh visualization modules
from bokeh.plotting import figure, show
from bokeh.io import output_notebook
from bokeh.models import ColumnDataSource, CategoricalColorMapper, HoverTool, FactorRange
from bokeh.layouts import gridplot
from bokeh.transform import factor_cmap
from bokeh.palettes import Spectral6, Category10
output_notebook()
# Load altair visualization library
import altair as alt
# Load penguins data
from palmerpenguins import load_penguins
# Rows with missing values are dropped up front so no later example has to handle them
penguins = load_penguins().dropna()
Interactive Data Visualization with Python
Interactive figures are an essential tool for communicating data insights, in particular in reports or dashboards. In this blog post, I compare different libraries for dynamic data visualization in Python. Before we dive into the comparison, here is a quick introduction to each contestant.
plotly
is an interactive, open-source plotting library that enables the creation of publication-quality figures. It supports a wide range of chart types including line charts, scatter plots, bar charts, pie charts, bubble charts, heatmaps, and more advanced visualizations like 3D plots and geographical maps. One of the key features of plotly
is its ability to produce interactive plots that users can zoom, pan, and hover over, providing tooltips and additional information, which makes it highly effective for data exploration and presentation.
bokeh
is a powerful, flexible library for creating interactive plots and dashboards in the web browser. It is designed to help users create elegant, concise constructions of versatile graphics with high-performance interactivity over very large or streaming datasets. One of the core features of bokeh
is its ability to generate dynamic javascript plots directly from Python code, which means you can harness the interactivity of web technologies without needing to write any javascript yourself. The plots can be embedded in HTML pages or served as standalone applications, making it a versatile choice for web development and data analysis tasks.
altair
is a declarative statistical visualization library, designed to create interactive visualizations with a minimal amount of code. It is built on top of the powerful Vega and Vega-Lite visualization grammars, enabling the construction of a wide range of statistical plots with a simple and intuitive syntax. One of the key advantages of altair
is its emphasis on data-driven visualization design. By allowing users to think about their data first and foremost, altair facilitates the exploration and understanding of complex datasets.
I compare code to generate plotly
, bokeh
, and altair
output in the post below. The types of plots that I chose for the comparison heavily draw on the examples given in R for Data Science - an amazing resource if you want to get started with data visualization. Spoiler alert: I’m not always able to replicate the same figure with all approaches (yet).
Loading libraries and data
We start by loading a few data manipulation libraries, the main libraries and modules of interest, and palmerpenguins
data. We then use the penguins
data frame as the data to compare all functions and methods below. Note that I drop all rows with missing values because I don’t want to get into related messages in this post.
A full-blown example
Let’s start with an advanced example that combines many different aesthetics at the same time: we plot two columns against each other, use color and shape aesthetics to differentiate species, include separate regression lines for each species, manually set nice labels, and use a theme. You can click through the results in the tabs below.
Note that we have to manually add regression lines for plotly
and bokeh
, while altair
has built-in support for them.
# plotly: scatter of bill length vs. flipper length, colored and shaped by species
fig_full = (px.scatter(
    penguins, x = "bill_length_mm", y = "flipper_length_mm",
    color = "species", symbol = "species",
    title = "Bill length vs. flipper length",
    labels = {"bill_length_mm": "Bill length (mm)",
              "flipper_length_mm": "Flipper length (mm)",
              "species": "Species"})
  .update_traces(marker = dict(size = 10))
  .update_layout(plot_bgcolor = "white",
                 xaxis = dict(zeroline = False, ticklen = 5),
                 yaxis = dict(zeroline = False, ticklen = 5))
)

# Fit one OLS line per species and add it as a separate line trace
for species in penguins["species"].unique():
  penguins_subset = penguins[penguins["species"] == species]
  X = penguins_subset["bill_length_mm"]
  X = sm.add_constant(X)
  y = penguins_subset["flipper_length_mm"]

  model = sm.OLS(y, X).fit()
  # params is label-indexed (constant first, then slope); use .iloc for
  # positional access instead of integer keys
  line = model.params.iloc[0] + model.params.iloc[1] * penguins_subset["bill_length_mm"]

  fig_full.add_trace(
    go.Scatter(x = penguins_subset["bill_length_mm"], y = line,
               mode = "lines", showlegend = False)
  )

fig_full
# bokeh: the same figure, built from one scatter glyph and one fitted line per species
fig_full = figure(
  title = "Bill length vs. flipper length",
  x_axis_label = "Bill length (mm)", y_axis_label = "Flipper length (mm)",
  tools = "pan,wheel_zoom,box_zoom,reset,hover",
  tooltips = [
    ("Bill length (mm)", "@bill_length_mm"),
    ("Flipper length (mm)", "@flipper_length_mm"),
    ("Species", "@species")
  ]
)

species = penguins["species"].unique()
color_map = {
  species_name: color
  for species_name, color in zip(species, ["red", "green", "blue"])
}

# The loop variable has its own name so it does not overwrite the `species` array
for species_name in species:
  penguins_subset = penguins[penguins["species"] == species_name]

  fig_full.scatter(
    source = ColumnDataSource(penguins_subset),
    x = "bill_length_mm", y = "flipper_length_mm",
    legend_label = species_name, color = color_map[species_name],
    size = 10, fill_alpha = 0.6
  )

  # OLS fit of flipper length on bill length (with intercept) for this species
  X = penguins_subset["bill_length_mm"]
  X = sm.add_constant(X)
  y = penguins_subset["flipper_length_mm"]

  model = sm.OLS(y, X).fit()
  predictions = model.predict(X)

  fig_full.line(
    penguins_subset["bill_length_mm"], predictions,
    color = color_map[species_name], line_width = 2, legend_label = species_name
  )

fig_full.legend.title = "Species"
fig_full.legend.location = "top_left"
fig_full.background_fill_color = "white"
fig_full.border_fill_color = "white"
fig_full.outline_line_color = None

show(fig_full)
# altair: point layer with color and shape mapped to species
points = (alt.Chart(penguins)
  .mark_point(size = 100, filled = True)
  .encode(
    x = alt.X("bill_length_mm",
              scale = alt.Scale(zero = False),
              title = "Bill length (mm)",
              axis = alt.Axis(tickCount = 5, grid = False)),
    y = alt.Y("flipper_length_mm",
              scale = alt.Scale(zero = False),
              title = "Flipper length (mm)",
              axis = alt.Axis(tickCount = 5, grid = False)),
    color = "species:N", shape = "species:N",
    tooltip = [alt.Tooltip("bill_length_mm", title = "Bill length (mm)"),
               alt.Tooltip("flipper_length_mm", title = "Flipper length (mm)"),
               alt.Tooltip("species", title = "Species")]
  )
)

# Regression layer: transform_regression fits one line per species group
regression_lines = (alt.Chart(penguins)
  .transform_regression(
    "bill_length_mm", "flipper_length_mm", groupby = ["species"]
  )
  .mark_line()
  .encode(
    x = "bill_length_mm:Q", y = "flipper_length_mm:Q",
    color = "species:N"
  )
)

# Layer both charts and apply title and theme settings to the combined figure
fig_full = ((points + regression_lines)
  .properties(title = "Bill length vs. flipper length")
  .configure_view(stroke = "transparent", fill = "white")
  .configure_axis(labelFontSize = 10, titleFontSize = 12)
)
fig_full
Visualizing distributions
A categorical variable
Let’s break down the differences in smaller steps by focusing on simpler examples. If you have a categorical variable and want to compare its relevance in your data, then bar charts are your friends. The code chunks below show you how to implement them for each approach.
# Count penguins per island, then draw one bar per island with plotly
island_counts = (penguins
  .groupby("island")
  .size()
  .reset_index(name = "n")
)

(px.bar(island_counts, x = "island", y = "n")
  .update_layout(barmode = "stack")
)
# Count penguins per island, then draw one bar per island with bokeh
island_counts = (penguins
  .groupby("island")
  .size()
  .reset_index(name = "n")
)

# The island names are passed as x_range to define the categorical axis
islands = island_counts["island"].unique()

fig_bar = figure(x_range = islands,
                 tools = "pan,wheel_zoom,box_zoom,reset,hover")

fig_bar.vbar(source = ColumnDataSource(island_counts),
             x = "island", top = "n", width = 0.9, line_color = "white")

show(fig_bar)
# Count penguins per island, then draw one bar per island with altair
island_counts = (penguins
  .groupby("island")
  .size()
  .reset_index(name = "n")
)

(alt.Chart(island_counts)
  .mark_bar()
  .encode(
    x = "island", y = "n",
    tooltip = ["island", "n"]
  )
)
A numerical variable
If you have a numerical variable, usually histograms are a good starting point to get a better feeling for the distribution of your data. You can quickly create histograms in plotly
and altair
, while you have to manually construct the histogram from individual bars in bokeh
.
# plotly histogram of bill length with a bin width of 2
(px.histogram(penguins, x = "bill_length_mm")
  .update_traces(xbins = dict(size = 2))
)
# bokeh histogram: bin the data with numpy and draw each bin as a quad
bin_size = 2
# Extend the stop value by one bin so the last edge lies beyond the maximum
bins = np.arange(
  start = penguins["bill_length_mm"].min(),
  stop = penguins["bill_length_mm"].max() + bin_size,
  step = bin_size
)
hist, edges = np.histogram(penguins["bill_length_mm"], bins = bins)

# Each bar runs from one edge to the next, with the bin count as its height
source = ColumnDataSource(
  data = dict(top = hist, left = edges[:-1], right = edges[1:])
)

fig_histogram = figure(tools = "pan,wheel_zoom,box_zoom,reset,hover")

fig_histogram.quad(
  source = source,
  bottom = 0, top = "top", left = "left", right = "right",
  fill_color = "skyblue", line_color = "white"
)

show(fig_histogram)
# altair histogram: bin bill length in steps of 2 and count rows per bin
(alt.Chart(penguins)
  .mark_bar()
  .encode(
    x = alt.X("bill_length_mm:Q", bin = alt.Bin(step = 2)),
    y = alt.Y("count()"),
    tooltip = [alt.Tooltip("bill_length_mm:Q", bin = alt.Bin(step = 2)),
               alt.Tooltip("count()")]
  )
)
Visualizing relationships
A numerical and a categorical variable
To visualize relationships, you need to have at least two columns. If you have a numerical and a categorical variable, then histograms or densities with groups are a good starting point. The next example illustrates the use of densities. plotly
and altair
have built-in support for densities, while you have to manually compute the densities in bokeh
.
# plotly: overlaid, density-normalized histograms of body mass per species
(px.histogram(penguins, x = "body_mass_g", color = "species",
              histnorm = "density", barmode = "overlay", opacity = 0.5)
  .update_traces(marker_line_width = 0.75)
)
/Users/krise/Documents/GitHub/tidy-intelligence/blog/renv/python/virtualenvs/renv-python-3.10/lib/python3.10/site-packages/plotly/express/_core.py:2065: FutureWarning:
When grouping with a length-1 list-like, you will need to pass a length-1 tuple to get_group in a future version of pandas. Pass `(name,)` instead of `name` to silence this warning.
# bokeh: compute a Gaussian kernel density estimate per species and draw it as a patch
species = penguins["species"].unique()

colors = Category10[len(species)]

fig_densities = figure(tools = "pan,wheel_zoom,box_zoom,reset,hover")

# The loop variable has its own name so it does not overwrite the `species` array
for j, species_name in enumerate(species):
  penguins_subset = penguins[penguins["species"] == species_name]
  body_mass = penguins_subset["body_mass_g"].dropna()

  # Evaluate the density on 100 evenly spaced points across the observed range
  kde = gaussian_kde(body_mass)
  x_range = np.linspace(body_mass.min(), body_mass.max(), 100)
  density = kde(x_range)

  source = ColumnDataSource(
    data = dict(body_mass_g = x_range, density = density,
                species = [species_name] * len(x_range))
  )

  fig_densities.patch(
    source = source,
    x = "body_mass_g", y = "density",
    alpha = 0.5, color = colors[j], legend_label = species_name
  )

show(fig_densities)
# altair: transform_density computes the density per species; draw it as overlaid areas
(alt.Chart(penguins)
  .transform_density("body_mass_g",
                     as_ = ["body_mass_g", "density"], groupby = ["species"])
  .mark_area(opacity = 0.5)
  .encode(
    x = alt.X("body_mass_g:Q"), y = alt.Y("density:Q"),
    color = "species:N", tooltip = ["species:N", "body_mass_g:Q"]
  )
)
Two categorical columns
Stacked bar plots are a good way to display the relationship between two categorical columns. For plotly
and altair
, we simply compute the percentages by species and island and put them into the bar plotting functions. Note that bokeh
is peculiar because it requires the data in wide format for stacked bar charts.
# Count penguins per species and island, then compute each island's share within its species
species_island_counts = (penguins
  .groupby(["species", "island"])
  .size()
  .reset_index(name = "n")
  .assign(
    percentage = lambda x: x["n"] / x.groupby("species")["n"].transform("sum")
  )
)

px.bar(species_island_counts, x = "species", y = "percentage",
       color = "island", barmode = "stack")
/Users/krise/Documents/GitHub/tidy-intelligence/blog/renv/python/virtualenvs/renv-python-3.10/lib/python3.10/site-packages/plotly/express/_core.py:2065: FutureWarning:
When grouping with a length-1 list-like, you will need to pass a length-1 tuple to get_group in a future version of pandas. Pass `(name,)` instead of `name` to silence this warning.
# Count penguins per species and island, then compute each island's share within its species
species_island_counts = (penguins
  .groupby(["species", "island"])
  .size()
  .reset_index(name = "n")
  .assign(
    percentage = lambda x: x["n"] / x.groupby("species")["n"].transform("sum")
  )
)

# Reshape to wide format (one column per island) for vbar_stack;
# species/island combinations that do not occur become 0
species_island_counts_wide = (species_island_counts
  .pivot(index = "species", columns = "island", values = "percentage")
  .fillna(0)
)

fig_stacked = figure(x_range = penguins["species"].unique())

fig_stacked.vbar_stack(
  source = ColumnDataSource(data = species_island_counts_wide),
  stackers = penguins["island"].unique(), x = "species",
  width = 0.9, color = ["red", "blue", "green"]
)

show(fig_stacked)
# Count penguins per species and island, then compute each island's share within its species
species_island_counts = (penguins
  .groupby(["species", "island"])
  .size()
  .reset_index(name = "n")
  .assign(
    percentage = lambda x: x["n"] / x.groupby("species")["n"].transform("sum")
  )
)

# Bars are colored by island; the order encoding sorts the segments by island, ascending
(alt.Chart(species_island_counts)
  .mark_bar()
  .encode(
    x = "species", y = "percentage", color = "island",
    order = alt.Order("island", sort = "ascending"),
    tooltip = ["species", "island", "percentage"]
  )
)
Two numerical columns
Scatter plots and regression lines are definitely the most common approach for visualizing the relationship between two numerical columns and we focus on scatter plots for this example (see the first visualization example if you want to see again how to add a regression line). Note that altair
axis ranges include 0 by default, so you need to manually tell the scale to ignore it.
# plotly scatter plot of bill length vs. flipper length
(px.scatter(penguins, x = "bill_length_mm", y = "flipper_length_mm")
  .update_traces(marker = dict(size = 10))
)
# bokeh scatter plot of bill length vs. flipper length
fig_scatter = figure(tools = "pan,wheel_zoom,box_zoom,reset,hover")

# scatter() is used here to match the full example at the top of the post
fig_scatter.scatter(
  source = ColumnDataSource(penguins),
  x = "bill_length_mm", y = "flipper_length_mm",
  size = 10
)

show(fig_scatter)
# altair scatter plot; zero = False is set on both scales
(alt.Chart(penguins)
  .mark_circle(size = 100)
  .encode(
    x = alt.X("bill_length_mm", scale = alt.Scale(zero = False)),
    y = alt.Y("flipper_length_mm", scale = alt.Scale(zero = False)),
    tooltip = ["bill_length_mm", "flipper_length_mm"]
  )
)
Three or more columns
You can include more information by mapping columns to additional aesthetics. For instance, we can map colors and shapes to species and create separate plots for each island by using facets. Facets are actually a great way to extend your figures, so I highly recommend playing around with them using your own data.
Facets in bokeh
involve a more manual process because it doesn’t have a direct equivalent of plotly
’s facet_col
parameter or altair
’s facet()
method. Instead, you’ll create individual plots for each facet and arrange them in a grid, which also means that you cannot have an automatically shared legend.
# plotly: color by species and one facet column per island
px.scatter(
  penguins, x = "bill_length_mm", y = "flipper_length_mm",
  color = "species", facet_col = "island"
)
# bokeh facets: build one figure per island and arrange the figures in a grid
islands = penguins["island"].unique()
species = penguins["species"].unique()

# One mapper is reused for every figure, so each one maps species to the same palette
color_mapper = CategoricalColorMapper(
  factors = species, palette = ["red", "green", "blue"]
)

plots = []
for island in islands:
  penguins_subset = penguins[penguins["island"] == island]

  p = figure(tools = "pan,wheel_zoom,box_zoom,reset",
             width = 250, height = 250)

  # scatter() is used here to match the full example at the top of the post
  p.scatter(
    x = "bill_length_mm", y = "flipper_length_mm",
    source = ColumnDataSource(penguins_subset),
    color = {"field": "species", "transform": color_mapper},
    legend_field = "species", size = 8
  )

  plots.append(p)

fig_grid = gridplot(plots, ncols = 3)

show(fig_grid)
# altair: color by species and one facet column per island
(alt.Chart(penguins)
  .mark_circle()
  .encode(
    x = alt.X("bill_length_mm", scale = alt.Scale(zero = False)),
    y = alt.Y("flipper_length_mm", scale = alt.Scale(zero = False)),
    tooltip = ["bill_length_mm", "flipper_length_mm"],
    color = "species:N"
  )
  .facet(column = "island:N")
)
Time series
As a last example, we quickly dive into time series plots where you typically show multiple lines over some date vector. Here, I aggregate the number of penguins by year and island and plot the corresponding lines. While you can simply define colors and line types in plotly
and altair
plotting functions, you have to manually loop in bokeh
.
# Count penguins per year and island, then draw one line per island with plotly
year_island_count = (penguins
  .groupby(["year", "island"])
  .size()
  .reset_index(name = "n")
)

px.line(year_island_count, x = "year", y = "n",
        color = "island", line_shape = "linear", line_dash = "island")
# bokeh time series: one line per island, each with its own color and dash pattern
islands = year_island_count["island"].unique()
colors = ["blue", "green", "red"]
dashes = ["solid", "dashed", "dotdash"]

fig_time_series = figure(tools = "pan,wheel_zoom,box_zoom,reset,hover")

for j, island in enumerate(islands):
  year_island_count_subset = year_island_count[
    year_island_count["island"] == island
  ]

  # The modulo cycles through the styles if there are more islands than entries
  fig_time_series.line(
    source = ColumnDataSource(year_island_count_subset),
    x = "year", y = "n",
    legend_label = island,
    color = colors[j % len(colors)],
    line_dash = dashes[j % len(dashes)], line_width = 2
  )

show(fig_time_series)
# altair time series: one line per island, distinguished by color and dash pattern
# NOTE(review): "year" is encoded as temporal (:T); if the column holds plain
# integers such as 2007, confirm the axis renders as intended (":O" or a
# datetime conversion may be needed)
(alt.Chart(year_island_count)
  .mark_line()
  .encode(
    x = "year:T", y = "n:Q",
    color = "island:N", strokeDash = "island:N",
    tooltip = ["year", "n", "island"]
  )
)
Conclusion
plotly
, bokeh
, and altair
each cater to distinct visualization needs in Python. plotly
shines with its interactive, high-quality visuals and ease of embedding in web applications, making it ideal for creating complex interactive charts and dashboards. bokeh
is focused on real-time data visualizations and interactivity, particularly suited for web apps that require dynamic data streaming. Its strength lies in the seamless integration of Python code with web technologies. altair
offers a declarative approach, emphasizing simplicity and efficiency in creating elegant statistical visualizations with minimal code, making it ideal for exploratory data analysis in notebooks.