123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507508509510511512513514515516517518519520521522523524525526527528529530531532533534535536537538539540541542543544545546547548549550551552553554555556557558559560561562563564565566567568569570571572573574575576577578579580581582583584585586587588589590591592593594595596597598599600601602603604605606607608609610611612613614615616617618619620621622623624625626627628629630631632633634635636637638639640 |
- # being a bit too dynamic
- # pylint: disable=E1101
- from __future__ import division
- import numpy as np
- from pandas.compat import lmap, lrange, range, zip
- from pandas.util._decorators import deprecate_kwarg
- from pandas.core.dtypes.missing import notna
- from pandas.io.formats.printing import pprint_thing
- from pandas.plotting._style import _get_standard_colors
- from pandas.plotting._tools import _set_ticks_props, _subplots
def scatter_matrix(frame, alpha=0.5, figsize=None, ax=None, grid=False,
                   diagonal='hist', marker='.', density_kwds=None,
                   hist_kwds=None, range_padding=0.05, **kwds):
    """
    Draw a matrix of pairwise scatter plots for the numeric columns.

    Cell ``(i, j)`` plots column ``i`` on the y axis against column ``j`` on
    the x axis. Cells on the diagonal show the distribution of one column.

    Parameters
    ----------
    frame : DataFrame
        Only the columns returned by ``frame._get_numeric_data()`` are drawn.
    alpha : float, optional
        Amount of transparency applied to the scatter points.
    figsize : (float, float), optional
        A tuple (width, height) in inches.
    ax : Matplotlib axis object, optional
        Forwarded to ``_subplots``.
    grid : bool, optional
        NOTE: the function body never reads this argument, so it currently
        has no effect on the plot.
    diagonal : {'hist', 'kde'}
        What to draw on the diagonal: a histogram (``'hist'``) or a Gaussian
        kernel density estimate (``'kde'``; ``'density'`` is accepted as a
        synonym). Any other value leaves the diagonal cells empty.
    marker : str, optional
        Matplotlib marker type, default '.'. It is passed through
        ``_get_marker_compat`` before use.
    density_kwds : dict, optional
        Keyword arguments passed to the ``plot`` call that draws the kernel
        density estimate.
    hist_kwds : dict, optional
        Keyword arguments passed to the ``hist`` call.
    range_padding : float, optional
        Relative extension of each axis range with respect to
        (x_max - x_min) or (y_max - y_min), default 0.05. Half of the
        extension is added on each side.
    kwds : other plotting keyword arguments
        Passed to the ``scatter`` call. ``edgecolors`` defaults to ``'none'``
        when not supplied.

    Returns
    -------
    axes
        The axes grid produced by ``_subplots``, indexed as ``axes[i, j]``.

    Examples
    --------
    >>> df = pd.DataFrame(np.random.randn(1000, 4), columns=['A','B','C','D'])
    >>> scatter_matrix(df, alpha=0.2)
    """
    numeric = frame._get_numeric_data()
    columns = numeric.columns
    n = columns.size
    fig, axes = _subplots(naxes=n * n, figsize=figsize, ax=ax,
                          squeeze=False)

    # no gaps between subplots
    fig.subplots_adjust(wspace=0, hspace=0)

    present = notna(numeric)
    marker = _get_marker_compat(marker)

    if not hist_kwds:
        hist_kwds = {}
    if not density_kwds:
        density_kwds = {}

    # GH 14855
    kwds.setdefault('edgecolors', 'none')

    def _observed(name):
        # Values of one column with missing entries removed.
        return numeric[name].values[present[name].values]

    def _padded_limits(name):
        # Data range of one column, widened by range_padding / 2 per side.
        observed = _observed(name)
        low, high = np.min(observed), np.max(observed)
        pad = (high - low) * range_padding / 2.
        return (low - pad, high + pad)

    limits = [_padded_limits(name) for name in columns]

    for row, y_name in enumerate(columns):
        for col, x_name in enumerate(columns):
            cell = axes[row, col]

            if row != col:
                # Off-diagonal: scatter the rows where both columns are set.
                both = (present[y_name] & present[x_name]).values
                cell.scatter(numeric[x_name][both], numeric[y_name][both],
                             marker=marker, alpha=alpha, **kwds)
                cell.set_xlim(limits[col])
                cell.set_ylim(limits[row])
            else:
                # Diagonal: distribution of a single column.
                observed = _observed(y_name)
                if diagonal == 'hist':
                    cell.hist(observed, **hist_kwds)
                elif diagonal in ('kde', 'density'):
                    from scipy.stats import gaussian_kde
                    estimator = gaussian_kde(observed)
                    support = np.linspace(observed.min(), observed.max(),
                                          1000)
                    cell.plot(support, estimator.evaluate(support),
                              **density_kwds)
                cell.set_xlim(limits[row])

            cell.set_xlabel(x_name)
            cell.set_ylabel(y_name)

            # Only the left column keeps its y axis and only the bottom row
            # keeps its x axis.
            if j_is_inner(col):
                cell.yaxis.set_visible(False)
            if row != n - 1:
                cell.xaxis.set_visible(False)

    if len(columns) > 1:
        # The top-left cell is a diagonal cell, so its own y scale is not the
        # data range. Borrow the tick locations of its right-hand neighbour
        # and map them onto the top-left cell's y limits, labelling them with
        # the data values.
        first_limits = limits[0]
        ticks = axes[0][1].yaxis.get_majorticklocs()
        ticks = ticks[(first_limits[0] <= ticks) & (ticks <= first_limits[1])]
        fraction = ((ticks - first_limits[0]) /
                    (first_limits[1] - first_limits[0]))

        corner_limits = axes[0][0].get_ylim()
        positions = (fraction * (corner_limits[1] - corner_limits[0]) +
                     corner_limits[0])
        axes[0][0].yaxis.set_ticks(positions)

        if np.all(ticks == ticks.astype(int)):
            # if all ticks are int
            ticks = ticks.astype(int)
        axes[0][0].yaxis.set_ticklabels(ticks)

    _set_ticks_props(axes, xlabelsize=8, xrot=90, ylabelsize=8, yrot=0)

    return axes


def j_is_inner(col):
    """Return True for every column index except the left-most one."""
    return col != 0
def _get_marker_compat(marker):
    """
    Return ``marker`` if it is a key of ``matplotlib.lines.lineMarkers``.

    Any other value is replaced by ``'o'``.
    """
    import matplotlib.lines as mlines

    return marker if marker in mlines.lineMarkers else 'o'
def radviz(frame, class_column, ax=None, color=None, colormap=None, **kwds):
    """
    Plot a multidimensional dataset in 2D.

    Each Series in the DataFrame is represented as an evenly distributed
    slice on a circle. Each data point is rendered in the circle according to
    the value on each Series. Highly correlated `Series` in the `DataFrame`
    are placed closer on the unit circle.

    RadViz allows projecting an N-dimensional data set into a 2D space where
    the influence of each dimension can be interpreted as a balance between
    the influence of all dimensions.

    More info available at the `original article
    <http://citeseerx.ist.psu.edu/viewdoc/summary?doi=10.1.1.135.889>`_
    describing RadViz.

    Parameters
    ----------
    frame : `DataFrame`
        Pandas object holding the data.
    class_column : str
        Column name containing the name of the data point category.
    ax : :class:`matplotlib.axes.Axes`, optional
        A plot instance to which to add the information. When omitted, the
        current axes are used and their limits are set to [-1, 1] on both
        axes.
    color : list[str] or tuple[str], optional
        Assign a color to each category. Example: ['blue', 'green'].
    colormap : str or :class:`matplotlib.colors.Colormap`, default None
        Colormap to select colors from. If string, load colormap with that
        name from matplotlib.
    kwds : optional
        Options to pass to matplotlib scatter plotting method.

    Returns
    -------
    axes : :class:`matplotlib.axes.Axes`

    See Also
    --------
    pandas.plotting.andrews_curves : Plot clustering visualization.

    Examples
    --------
    .. plot::
        :context: close-figs

        >>> df = pd.DataFrame({
        ...         'SepalLength': [6.5, 7.7, 5.1, 5.8, 7.6, 5.0, 5.4, 4.6,
        ...                         6.7, 4.6],
        ...         'SepalWidth': [3.0, 3.8, 3.8, 2.7, 3.0, 2.3, 3.0, 3.2,
        ...                        3.3, 3.6],
        ...         'PetalLength': [5.5, 6.7, 1.9, 5.1, 6.6, 3.3, 4.5, 1.4,
        ...                         5.7, 1.0],
        ...         'PetalWidth': [1.8, 2.2, 0.4, 1.9, 2.1, 1.0, 1.5, 0.2,
        ...                        2.1, 0.2],
        ...         'Category': ['virginica', 'virginica', 'setosa',
        ...                      'virginica', 'virginica', 'versicolor',
        ...                      'versicolor', 'setosa', 'virginica',
        ...                      'setosa']
        ...     })
        >>> rad_viz = pd.plotting.radviz(df, 'Category')  # doctest: +SKIP
    """
    import matplotlib.pyplot as plt
    import matplotlib.patches as patches

    def normalize(series):
        # Min-max scale one column so its values span [0, 1].
        a = min(series)
        b = max(series)
        return (series - a) / (b - a)

    n = len(frame)
    classes = frame[class_column].drop_duplicates()
    class_col = frame[class_column]
    df = frame.drop(class_column, axis=1).apply(normalize)

    if ax is None:
        # Newer matplotlib releases reject Axes properties passed to
        # ``plt.gca``, so fetch the axes first and set the limits explicitly.
        ax = plt.gca()
        ax.set_xlim(-1, 1)
        ax.set_ylim(-1, 1)

    colors = _get_standard_colors(num_colors=len(classes), colormap=colormap,
                                  color_type='random', color=color)

    # Per class: [x coordinates, y coordinates] of the projected points.
    to_plot = {}
    for kls in classes:
        to_plot[kls] = [[], []]

    # One anchor per data column, evenly spaced on the unit circle.
    m = len(frame.columns) - 1
    s = np.array([(np.cos(t), np.sin(t))
                  for t in [2.0 * np.pi * (i / float(m))
                            for i in range(m)]])

    for i in range(n):
        row = df.iloc[i].values
        # Duplicate the row into two columns so it can weight both the x and
        # the y coordinate of every anchor.
        row_ = np.repeat(np.expand_dims(row, axis=1), 2, axis=1)
        # Weighted average of the anchors, weights being the scaled values.
        y = (s * row_).sum(axis=0) / row.sum()
        kls = class_col.iat[i]
        to_plot[kls][0].append(y[0])
        to_plot[kls][1].append(y[1])

    for i, kls in enumerate(classes):
        ax.scatter(to_plot[kls][0], to_plot[kls][1], color=colors[i],
                   label=pprint_thing(kls), **kwds)
    ax.legend()

    ax.add_patch(patches.Circle((0.0, 0.0), radius=1.0, facecolor='none'))

    for xy, name in zip(s, df.columns):
        ax.add_patch(patches.Circle(xy, radius=0.025, facecolor='gray'))

        # Push each label away from the circle centre: left/right follows the
        # sign of x, below/above follows the sign of y.
        if xy[0] < 0.0:
            dx, ha = -0.025, 'right'
        else:
            dx, ha = 0.025, 'left'
        if xy[1] < 0.0:
            dy, va = -0.025, 'top'
        else:
            dy, va = 0.025, 'bottom'
        ax.text(xy[0] + dx, xy[1] + dy, name, ha=ha, va=va, size='small')

    ax.axis('equal')
    return ax
@deprecate_kwarg(old_arg_name='data', new_arg_name='frame')
def andrews_curves(frame, class_column, ax=None, samples=200, color=None,
                   colormap=None, **kwds):
    """
    Generate a matplotlib plot of Andrews curves, for visualising clusters of
    multivariate data.

    Andrews curves have the functional form:

    f(t) = x_1/sqrt(2) + x_2 sin(t) + x_3 cos(t) +
           x_4 sin(2t) + x_5 cos(2t) + ...

    Where x coefficients correspond to the values of each dimension and t is
    linearly spaced between -pi and +pi. Each row of frame then corresponds to
    a single curve.

    Parameters
    ----------
    frame : DataFrame
        Data to be plotted, preferably normalized to (0.0, 1.0)
    class_column : str
        Name of the column containing class names
    ax : matplotlib axes object, default None
        Axes to draw on. When omitted, the current axes are used and their
        x limits are set to (-pi, pi).
    samples : int, default 200
        Number of points to plot in each curve
    color : list or tuple, optional
        Colors to use for the different classes
    colormap : str or matplotlib colormap object, default None
        Colormap to select colors from. If string, load colormap with that name
        from matplotlib.
    kwds : keywords
        Options to pass to matplotlib plotting method

    Returns
    -------
    ax : Matplotlib axis object
    """
    from math import sqrt, pi
    import matplotlib.pyplot as plt

    def function(amplitudes):
        # Build the Andrews curve f(t) for one row of coefficients.
        def f(t):
            x1 = amplitudes[0]
            result = x1 / sqrt(2.0)

            # Take the rest of the coefficients and resize them
            # appropriately. Take a copy of amplitudes as otherwise numpy
            # deletes the element from amplitudes itself.
            coeffs = np.delete(np.copy(amplitudes), 0)
            # One row per harmonic: column 0 is the sin coefficient and
            # column 1 the cos coefficient.
            coeffs.resize(int((coeffs.size + 1) / 2), 2)

            # Generate the harmonics and arguments for the sin and cos
            # functions.
            harmonics = np.arange(0, coeffs.shape[0]) + 1
            trig_args = np.outer(harmonics, t)

            result += np.sum(coeffs[:, 0, np.newaxis] * np.sin(trig_args) +
                             coeffs[:, 1, np.newaxis] * np.cos(trig_args),
                             axis=0)
            return result
        return f

    n = len(frame)
    class_col = frame[class_column]
    classes = frame[class_column].drop_duplicates()
    df = frame.drop(class_column, axis=1)
    t = np.linspace(-pi, pi, samples)
    used_legends = set()

    color_values = _get_standard_colors(num_colors=len(classes),
                                        colormap=colormap, color_type='random',
                                        color=color)
    colors = dict(zip(classes, color_values))

    if ax is None:
        # Newer matplotlib releases reject Axes properties passed to
        # ``plt.gca``, so fetch the axes first and set the limits explicitly.
        ax = plt.gca()
        ax.set_xlim(-pi, pi)

    for i in range(n):
        row = df.iloc[i].values
        f = function(row)
        y = f(t)
        kls = class_col.iat[i]
        label = pprint_thing(kls)
        # Attach a legend label only to the first curve of each class so the
        # legend shows one entry per class.
        if label not in used_legends:
            used_legends.add(label)
            ax.plot(t, y, color=colors[kls], label=label, **kwds)
        else:
            ax.plot(t, y, color=colors[kls], **kwds)

    ax.legend(loc='upper right')
    ax.grid()
    return ax
def bootstrap_plot(series, fig=None, size=50, samples=500, **kwds):
    """
    Bootstrap plot on mean, median and mid-range statistics.

    The bootstrap plot is used to estimate the uncertainty of a statistic
    by relying on repeated random sampling [1]_. This function draws
    `samples` samplings of `size` values each from `series` and plots the
    mean, median and mid-range of every sampling, both in drawing order (top
    row) and as histograms (bottom row).

    .. [1] "Bootstrapping (statistics)" in
       https://en.wikipedia.org/wiki/Bootstrapping_%28statistics%29

    Parameters
    ----------
    series : pandas.Series
        Pandas Series from where to get the samplings for the bootstrapping.
    fig : matplotlib.figure.Figure, default None
        If given, it will use the `fig` reference for plotting instead of
        creating a new one with default parameters.
    size : int, default 50
        Number of data points to consider during each sampling. Samplings
        are drawn with ``random.sample``, so `size` must be less than or
        equal to the length of the `series`.
    samples : int, default 500
        Number of times the bootstrap procedure is performed.
    **kwds :
        Options to pass to matplotlib plotting method. The same options are
        given to both the line plots and the histograms.

    Returns
    -------
    fig : matplotlib.figure.Figure
        Matplotlib figure

    See Also
    --------
    pandas.DataFrame.plot : Basic plotting for DataFrame objects.
    pandas.Series.plot : Basic plotting for Series objects.

    Examples
    --------
    .. plot::
        :context: close-figs

        >>> s = pd.Series(np.random.uniform(size=100))
        >>> fig = pd.plotting.bootstrap_plot(s)  # doctest: +SKIP
    """
    import random
    import matplotlib.pyplot as plt

    # random.sample(ndarray, int) fails on python 3.3, sigh
    population = list(series.values)
    draws = [random.sample(population, size) for _ in range(samples)]

    means = np.array([np.mean(draw) for draw in draws])
    medians = np.array([np.median(draw) for draw in draws])
    midranges = np.array([(min(draw) + max(draw)) * 0.5 for draw in draws])

    if fig is None:
        fig = plt.figure()

    x = list(range(samples))

    # (x label, statistic, draw as histogram?) in subplot order 1..6 of the
    # 2 x 3 layout: line plots on the top row, histograms on the bottom row.
    panels = [
        ("Sample", means, False),
        ("Sample", medians, False),
        ("Sample", midranges, False),
        ("Mean", means, True),
        ("Median", medians, True),
        ("Midrange", midranges, True),
    ]

    axes = []
    for position, (xlabel, statistic, as_hist) in enumerate(panels, 1):
        panel = fig.add_subplot(2, 3, position)
        panel.set_xlabel(xlabel)
        axes.append(panel)
        if as_hist:
            panel.hist(statistic, **kwds)
        else:
            panel.plot(x, statistic, **kwds)

    for panel in axes:
        plt.setp(panel.get_xticklabels(), fontsize=8)
        plt.setp(panel.get_yticklabels(), fontsize=8)

    return fig
@deprecate_kwarg(old_arg_name='colors', new_arg_name='color')
@deprecate_kwarg(old_arg_name='data', new_arg_name='frame', stacklevel=3)
def parallel_coordinates(frame, class_column, cols=None, ax=None, color=None,
                         use_columns=False, xticks=None, colormap=None,
                         axvlines=True, axvlines_kwds=None, sort_labels=False,
                         **kwds):
    """Parallel coordinates plotting.

    Every row of `frame` is drawn as one line across the plotted columns,
    colored by the row's value in `class_column`.

    Parameters
    ----------
    frame : DataFrame
    class_column : str
        Column name containing class names
    cols : list, optional
        A list of column names to use. When omitted, every column except
        `class_column` is plotted.
    ax : matplotlib.axis, optional
        matplotlib axis object
    color : list or tuple, optional
        Colors to use for the different classes
    use_columns : bool, optional
        If true, columns will be used as xticks
    xticks : list or tuple, optional
        A list of values to use for xticks
    colormap : str or matplotlib colormap, default None
        Colormap to use for line colors.
    axvlines : bool, optional
        If true, vertical lines will be added at each xtick
    axvlines_kwds : keywords, optional
        Options to be passed to axvline method for vertical lines. Defaults
        to a black line of width 1.
    sort_labels : bool, False
        Sort class_column labels, useful when assigning colors

        .. versionadded:: 0.20.0

    kwds : keywords
        Options to pass to matplotlib plotting method

    Returns
    -------
    ax: matplotlib axis object

    Raises
    ------
    ValueError
        If `use_columns` is True and the column labels are not numeric, or
        if `xticks` is not numeric or its length differs from the number of
        plotted columns.

    Examples
    --------
    >>> from matplotlib import pyplot as plt
    >>> df = pd.read_csv('https://raw.github.com/pandas-dev/pandas/master'
    ...                  '/pandas/tests/data/iris.csv')
    >>> pd.plotting.parallel_coordinates(
    ...     df, 'Name',
    ...     color=('#556270', '#4ECDC4', '#C7F464'))
    >>> plt.show()
    """
    import matplotlib.pyplot as plt

    if axvlines_kwds is None:
        axvlines_kwds = {'linewidth': 1, 'color': 'black'}

    row_count = len(frame)
    class_col = frame[class_column]
    classes = class_col.drop_duplicates()
    df = frame.drop(class_column, axis=1) if cols is None else frame[cols]

    ncols = len(df.columns)

    # determine values to use for xticks
    if use_columns is True:
        if not np.all(np.isreal(list(df.columns))):
            raise ValueError('Columns must be numeric to be used as xticks')
        x = df.columns
    elif xticks is None:
        x = list(range(ncols))
    else:
        if not np.all(np.isreal(xticks)):
            raise ValueError('xticks specified must be numeric')
        if len(xticks) != ncols:
            raise ValueError('Length of xticks must match number of columns')
        x = xticks

    if ax is None:
        ax = plt.gca()

    color_values = _get_standard_colors(num_colors=len(classes),
                                        colormap=colormap, color_type='random',
                                        color=color)

    if sort_labels:
        classes = sorted(classes)
        color_values = sorted(color_values)
    colors = dict(zip(classes, color_values))

    labelled = set()
    for position in range(row_count):
        y = df.iloc[position].values
        kls = class_col.iat[position]
        label = pprint_thing(kls)
        # Only the first line of each class carries a legend label.
        if label in labelled:
            ax.plot(x, y, color=colors[kls], **kwds)
        else:
            labelled.add(label)
            ax.plot(x, y, color=colors[kls], label=label, **kwds)

    if axvlines:
        for tick in x:
            ax.axvline(tick, **axvlines_kwds)

    ax.set_xticks(x)
    ax.set_xticklabels(df.columns)
    ax.set_xlim(x[0], x[-1])
    ax.legend(loc='upper right')
    ax.grid()
    return ax
def lag_plot(series, lag=1, ax=None, **kwds):
    """Lag plot for time series.

    Scatter each value ``y(t)`` against the value ``lag`` steps later,
    ``y(t + lag)``.

    Parameters
    ----------
    series : Time series
    lag : int, default 1
        Lag of the scatter plot. ``lag=0`` plots the series against itself.
    ax : Matplotlib axis object, optional
        Axes to draw on; the current axes are used when omitted.
    kwds : Matplotlib scatter method keyword arguments, optional
        ``c`` defaults to ``rcParams['patch.facecolor']`` when not supplied.

    Returns
    -------
    ax: Matplotlib axis object
    """
    import matplotlib.pyplot as plt

    # workaround because `c='b'` is hardcoded in matplotlibs scatter method
    kwds.setdefault('c', plt.rcParams['patch.facecolor'])

    data = series.values
    # ``data[:-0]`` is ``data[:0]`` (empty), which would leave x and y with
    # different lengths, so a zero lag needs the whole array instead.
    y1 = data[:-lag] if lag != 0 else data
    y2 = data[lag:]
    if ax is None:
        ax = plt.gca()
    ax.set_xlabel("y(t)")
    ax.set_ylabel("y(t + {lag})".format(lag=lag))
    ax.scatter(y1, y2, **kwds)
    return ax
def autocorrelation_plot(series, ax=None, **kwds):
    """Autocorrelation plot for time series.

    Plots the sample autocorrelation for lags 1 to ``len(series)`` together
    with horizontal reference lines at the 95% (solid) and 99% (dashed)
    levels, ``+/- z / sqrt(n)``.

    Parameters
    ----------
    series : Time series
    ax : Matplotlib axis object, optional
        Axes to draw on. When omitted, the current axes are used and their
        limits are set to (1, n) on x and (-1.0, 1.0) on y.
    kwds : keywords
        Options to pass to matplotlib plotting method. A legend is drawn
        when ``label`` is among them.

    Returns
    -------
    ax: Matplotlib axis object
    """
    import matplotlib.pyplot as plt

    n = len(series)
    data = np.asarray(series)
    if ax is None:
        # Newer matplotlib releases reject Axes properties passed to
        # ``plt.gca``, so fetch the axes first and set the limits explicitly.
        ax = plt.gca()
        ax.set_xlim(1, n)
        ax.set_ylim(-1.0, 1.0)

    mean = np.mean(data)
    # Lag-0 autocovariance, used to normalise every other lag.
    c0 = np.sum((data - mean) ** 2) / float(n)

    def r(h):
        # Autocorrelation at lag h: lag-h autocovariance divided by c0.
        return ((data[:n - h] - mean) *
                (data[h:] - mean)).sum() / float(n) / c0

    x = np.arange(n) + 1
    y = [r(h) for h in x]

    # Two-sided standard normal critical values for 95% and 99%.
    z95 = 1.959963984540054
    z99 = 2.5758293035489004
    ax.axhline(y=z99 / np.sqrt(n), linestyle='--', color='grey')
    ax.axhline(y=z95 / np.sqrt(n), color='grey')
    ax.axhline(y=0.0, color='black')
    ax.axhline(y=-z95 / np.sqrt(n), color='grey')
    ax.axhline(y=-z99 / np.sqrt(n), linestyle='--', color='grey')
    ax.set_xlabel("Lag")
    ax.set_ylabel("Autocorrelation")
    ax.plot(x, y, **kwds)
    if 'label' in kwds:
        ax.legend()
    ax.grid()
    return ax
|