milkbox.net

Using pandas I’ve been doing some data analysis and simple data exploration; most recently I wanted to build a curve for test score distributions. Since I like the way plots made with ggplot2 look — yes, that whole package is better, but I <3 Python — I took the opportunity to try out some code posted by Bicubic to style my Matplotlib plots.

While doing all this I figured out how to use Gaussian Kernel Density Estimation to make my histograms smooth. A question on Stack Overflow provided the bulk of the code and instructions on how to adjust the covariance_factor of the gaussian_kde class provided by the scipy stats module.

covariance_factor = .5 covariance_factor = .25

Pitfalls

One thing to note is that the gaussian_kde class requires floating point numbers, so convert integer data to floats before passing it in.

Code to generate the graphs is:

import matplotlib.pyplot as plt
import numpy as np
from pandas import Series
from scipy.stats import gaussian_kde


# Bogus sample data. The values are floats because gaussian_kde requires
# floating point numbers.
data = [1.5]*7 + [2.5]*2 + [3.5]*8 + [4.5]*3 + [5.5]*1 + [6.5]*8

# Build the kernel density estimator. A scalar bw_method is used directly as
# the covariance factor (lower means more detail), so there is no need to
# patch covariance_factor and call the private _compute_covariance().
density = gaussian_kde(data, bw_method=.25)


# generate a fake range of x values at which to evaluate the density
xs = np.arange(0, 24, .1)

# fill y values using the density estimator
ys = density(xs)


# ggaxes is a wrapper around rstyle
ax = ggaxes(plt.figure(figsize=(10, 8)))

# Smoothed curve plus a translucent fill underneath it.
ax.plot(xs, ys, antialiased=True, linewidth=2, color="#A81450")
ax.fill_between(xs, ys, alpha=.5, zorder=5, antialiased=True, color="#E01B6A")

# density=True normalises the histogram so it shares a scale with the KDE
# curve; it replaces the `normed` keyword that matplotlib has removed.
Series(data).hist(ax=ax, density=True, bins=8, color='grey', antialiased=True)
ax.set_xlim(0, 8)

plt.savefig("gaussian_kde_25.png")

The code above requires the following function:

from rstyle import rstyle

def ggaxes(fig=None):
    """Return a single axes on ``fig`` that has been passed through ``rstyle``.

    When ``fig`` is None a new figure is created with ``plt.figure()``.
    The axes is added as subplot 111, styled, and returned.
    """
    figure = plt.figure() if fig is None else fig
    axes = figure.add_subplot(111)
    rstyle(axes)
    return axes

rstyle is the function to style matplotlib like ggplot2 (originally located here) which I keep in the file rstyle.py somewhere in my PYTHONPATH:

from pylab import *

def rstyle(ax):
    """Style an axes to appear like ggplot2.

    Must be called after all plot and axis manipulation operations have been
    carried out (needs to know final tick spacing).

    The styling is applied to ``ax`` in place; nothing is returned. As a side
    effect the global ``rcParams`` tick directions are set to ``'out'``.
    """
    # set the style of the major and minor grid lines, filled blocks
    ax.grid(True, 'major', color='w', linestyle='-', linewidth=1.4)
    ax.grid(True, 'minor', color='0.92', linestyle='-', linewidth=0.7)
    ax.patch.set_facecolor('0.85')
    ax.set_axisbelow(True)

    # set minor tick spacing to 1/2 of the major ticks. The ticks are read
    # from `ax` itself rather than from pyplot's current axes, which may be a
    # different axes when a figure holds several subplots.
    for axis, ticks in ((ax.xaxis, ax.get_xticks()), (ax.yaxis, ax.get_yticks())):
        if len(ticks) < 2:
            # a spacing cannot be derived from fewer than two ticks
            continue
        half_step = (ticks[1] - ticks[0]) / 2.0
        if half_step != 0:
            axis.set_minor_locator(MultipleLocator(half_step))

    # remove axis border
    for spine in ax.spines.values():
        spine.set_alpha(0)

    # restyle the tick lines
    for line in ax.get_xticklines() + ax.get_yticklines():
        line.set_markersize(5)
        line.set_color("gray")
        line.set_markeredgewidth(1.4)

    # remove the minor tick lines
    for line in ax.xaxis.get_ticklines(minor=True) + ax.yaxis.get_ticklines(minor=True):
        line.set_markersize(0)

    # only show bottom left ticks, pointing out of axis. rcParams only
    # influences axes created later, so the direction is also applied to this
    # axes explicitly.
    rcParams['xtick.direction'] = 'out'
    rcParams['ytick.direction'] = 'out'
    ax.tick_params(axis='both', which='both', direction='out')
    ax.xaxis.set_ticks_position('bottom')
    ax.yaxis.set_ticks_position('left')

    # soften the legend frame, if the axes has a legend
    lg = ax.get_legend()
    if lg is not None:
        lg.get_frame().set_linewidth(0)
        lg.get_frame().set_alpha(0.5)


def rhist(ax, data, **keywords):
    """Draw a histogram on ``ax`` using ggplot2-like default styling.

    Behaves like ``ax.hist(data, **keywords)`` and returns whatever that call
    returns. Any style keyword supplied by the caller takes precedence; the
    defaults below are only filled in for keywords that were not given.
    """
    ggplot_style = (
        ('facecolor', '0.3'),
        ('edgecolor', '0.28'),
        ('linewidth', '1'),
        ('bins', 100),
    )

    # setdefault leaves caller-provided values untouched
    for option, fallback in ggplot_style:
        keywords.setdefault(option, fallback)

    return ax.hist(data, **keywords)


def rbox(ax, data, **keywords):
    """Creates a ggplot2 style boxplot, is equivalent to calling ax.boxplot with the following additions:

    Keyword arguments:
    colors -- array-like collection of colours for box fills; cycled when
              there are more boxes than colours. Omitted, None or empty means
              every box is filled with the default light grey ('0.95').
    names -- array-like collection of box names which are passed on as tick labels

    All other keyword arguments are forwarded to ax.boxplot unchanged.
    Returns the dictionary produced by ax.boxplot.
    """
    # Strip the extra keywords so they are not forwarded to ax.boxplot.
    colors = keywords.pop('colors', None)
    names = keywords.pop('names', None)
    hasColors = colors is not None and len(colors) > 0

    bp = ax.boxplot(data, **keywords)

    # Label the boxes after drawing them: boxplot manages the x ticks itself,
    # so labels assigned beforehand may be replaced or rejected.
    if names is not None:
        ax.tickNames = plt.setp(ax, xticklabels=names)

    plt.setp(bp['boxes'], color='black')
    plt.setp(bp['whiskers'], color='black', linestyle='solid')
    plt.setp(bp['fliers'], color='black', alpha=0.9, marker='o', markersize=3)
    plt.setp(bp['medians'], color='black')

    # Fill each box by laying a polygon over its outline. Iterating the boxes
    # that were actually drawn (rather than len(data)) keeps this correct when
    # data is a 2-D array, and using every outline vertex also covers notched
    # boxes, whose outline has more than five points.
    for i, box in enumerate(bp['boxes']):
        # Polygon needs a concrete sequence of (x, y) pairs, not a zip iterator.
        boxCoords = list(zip(box.get_xdata(), box.get_ydata()))
        fill = colors[i % len(colors)] if hasColors else '0.95'
        ax.add_patch(Polygon(boxCoords, facecolor=fill))
    return bp