# NOTE(review): the `$ ...` lines are shell commands, not Python statements;
# run them in a terminal before executing the Python lines that follow.
$ python -m spacy download en_core_web_lg
import en_core_web_lg
nlp = en_core_web_lg.load()
# NOTE(review): the three lines below repeat the snippet above verbatim, and
# `nlp` is not referenced anywhere else in the visible source.
$ python -m spacy download en_core_web_lg
import en_core_web_lg
nlp = en_core_web_lg.load()
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load in
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
# Input data files are available in the "../input/" directory.
# For example, running this (by clicking run or pressing Shift+Enter) will list the files in the input directory
import os
print(os.listdir("../input"))
# Any results you write to the current directory are saved as output.
Importing pandas and Seaborn module
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
plt.style.use('fivethirtyeight')
import warnings
warnings.filterwarnings('ignore') #this will ignore the warnings.it wont display warnings in notebook
Importing Iris data set
iris=pd.read_csv('../input/Iris.csv')
Displaying data
iris.head()
iris.drop('Id',axis=1,inplace=True)
Checking if there are any missing values
iris.info()
iris['Species'].value_counts()
This data set has three varieties of Iris plant.
1. Describing the data
iris.describe().plot(kind = "area",fontsize=27, figsize = (20,8), table = True,colormap="rainbow")
plt.xlabel('Statistics',)
plt.ylabel('Value')
plt.title("General Statistics of Iris Dataset")
Above plot gives us a General Idea about the dataset.
2.Bar Plot : Here the frequency of the observation is plotted.In this case we are plotting the frequency of the three species in the Iris Dataset
# Count plot: one bar per species, showing how many rows belong to it.
# The column is passed as `x=` because recent seaborn releases no longer
# accept it as the first positional argument (it would collide with `data`).
sns.countplot(x='Species', data=iris)
plt.show()
3. Pie Chart :
#f,ax=plt.subplots(1,2,figsize=(18,8))
iris['Species'].value_counts().plot.pie(explode=[0.1,0.1,0.1],autopct='%1.1f%%',shadow=True,figsize=(10,8))
#iris['Species'].value_counts().plot.pie(explode=[0.1,0.1,0.1],autopct='%1.1f%%',ax=ax[0],shadow=True)
#ax[0].set_title('Iris Species Count')
#ax[0].set_ylabel('Count')
#sns.countplot('Species',data=iris,ax=ax[1])
#ax[1].set_title('Iris Species Count')
plt.show()
We can see that there are 50 samples each of all the Iris Species in the data set.
4. Joint plot: Jointplot is seaborn library specific and can be used to quickly visualize and analyze the relationship between two variables and describe their individual distributions on the same plot.
# Five joint-plot variants of sepal length vs. sepal width. The columns are
# always passed as `x=` / `y=` keywords: recent seaborn releases reject them
# as positional arguments.
fig = sns.jointplot(x='SepalLengthCm', y='SepalWidthCm', data=iris)  # default kind
sns.jointplot(x="SepalLengthCm", y="SepalWidthCm", data=iris, kind="reg")
fig = sns.jointplot(x='SepalLengthCm', y='SepalWidthCm', kind='hex', data=iris)
sns.jointplot(x="SepalLengthCm", y="SepalWidthCm", data=iris, kind="kde", space=0, color='g')
# Default joint plot with KDE contours added to the joint axes (zorder=0).
g = (
    sns.jointplot(x="SepalLengthCm", y="SepalWidthCm", data=iris, color="k")
    .plot_joint(sns.kdeplot, zorder=0, n_levels=6)
)
5. FacetGrid Plot
import matplotlib.pyplot as plt
%matplotlib inline
# Scatter of sepal length vs. sepal width, one colour per species.
# The chained calls are wrapped in parentheses so the expression may span
# several lines (a line starting with `.map` is otherwise a syntax error), and
# `height` replaces the deprecated `size` keyword.
(
    sns.FacetGrid(iris, hue='Species', height=5)
    .map(plt.scatter, 'SepalLengthCm', 'SepalWidthCm')
    .add_legend()
)
6. Boxplot or Whisker plot: The box plot was first introduced in 1969 by the mathematician John Tukey. A box plot gives a statistical summary of the feature being plotted. The top line represents the maximum value, the top edge of the box is the third quartile, the middle line represents the median, and the bottom edge of the box represents the first quartile. The bottom-most line represents the minimum value of the feature. The height of the box is called the interquartile range. The black dots on the plot represent the outlier values in the data.
fig=plt.gcf()
fig.set_size_inches(10,7)
fig=sns.boxplot(x='Species',y='PetalLengthCm',data=iris,order=['Iris-virginica','Iris-versicolor','Iris-setosa'],linewidth=2.5,orient='v',dodge=False)
#iris.drop("Id", axis=1).boxplot(by="Species", figsize=(12, 6))
iris.boxplot(by="Species", figsize=(12, 6))
7. Strip plot
fig=plt.gcf()
fig.set_size_inches(10,7)
fig=sns.stripplot(x='Species',y='SepalLengthCm',data=iris,jitter=True,edgecolor='gray',size=8,palette='winter',orient='v')
8. Combining Box and Strip Plots
fig=plt.gcf()
fig.set_size_inches(10,7)
fig=sns.boxplot(x='Species',y='SepalLengthCm',data=iris)
fig=sns.stripplot(x='Species',y='SepalLengthCm',data=iris,jitter=True,edgecolor='gray')
# Box plot of petal length per species with the raw observations overlaid as
# a jittered strip plot, followed by individual recolouring of three artists.
ax = sns.boxplot(x="Species", y="PetalLengthCm", data=iris)
ax = sns.stripplot(x="Species", y="PetalLengthCm", data=iris, jitter=True, edgecolor="gray")
# (index into ax.artists, face colour), applied in the original order.
# NOTE(review): this presumes the boxes are exposed through `ax.artists` —
# confirm against the installed seaborn/matplotlib versions.
for artist_index, face_colour in ((2, 'yellow'), (1, 'red'), (0, 'green')):
    box = ax.artists[artist_index]
    box.set_facecolor(face_colour)
    box.set_edgecolor('black')
plt.show()
9. Violin Plot It is used to visualize the distribution of data and its probability distribution.This chart is a combination of a Box Plot and a Density Plot that is rotated and placed on each side, to show the distribution shape of the data. The thick black bar in the centre represents the interquartile range, the thin black line extended from it represents the 95% confidence intervals, and the white dot is the median.Box Plots are limited in their display of the data, as their visual simplicity tends to hide significant details about how values in the data are distributed
fig=plt.gcf()
fig.set_size_inches(10,7)
fig=sns.violinplot(x='Species',y='SepalLengthCm',data=iris)
plt.figure(figsize=(15,10))
plt.subplot(2,2,1)
sns.violinplot(x='Species',y='PetalLengthCm',data=iris)
plt.subplot(2,2,2)
sns.violinplot(x='Species',y='PetalWidthCm',data=iris)
plt.subplot(2,2,3)
sns.violinplot(x='Species',y='SepalLengthCm',data=iris)
plt.subplot(2,2,4)
sns.violinplot(x='Species',y='SepalWidthCm',data=iris)
10. Pair Plot: A “pairs plot” is also known as a scatterplot matrix. In a single scatterplot, one variable in a data row is plotted against another variable's value; a pairs plot is just an elaboration on this, showing every variable paired with every other variable.
sns.pairplot(data=iris,kind='scatter')
sns.pairplot(iris,hue='Species')
11. Heat map: A heat map is used to find the correlation between different features in the dataset. A high positive or negative value shows that the features are highly correlated. This helps us select the parameters for machine learning.
# Correlation heat map of the numeric measurement columns.
fig = plt.gcf()
fig.set_size_inches(10, 7)
# Only numeric columns are correlated: `Species` holds strings, and recent
# pandas releases raise on non-numeric columns instead of skipping them.
fig = sns.heatmap(
    iris.select_dtypes(include='number').corr(),
    annot=True, cmap='cubehelix', linewidths=1, linecolor='k', square=True,
    mask=False, vmin=-1, vmax=1, cbar_kws={"orientation": "vertical"}, cbar=True,
)
12. Distribution plot: The distribution plot is suitable for comparing range and distribution for groups of numerical data. Data is plotted as value points along an axis. You can choose to display only the value points to see the distribution of values, a bounding box to see the range of values, or a combination of both as shown here.The distribution plot is not relevant for detailed analysis of the data as it deals with a summary of the data distribution.
iris.hist(edgecolor='black', linewidth=1.2)
fig=plt.gcf()
fig.set_size_inches(12,6)
13. Swarm plot It looks a bit like a friendly swarm of bees buzzing about their hive. More importantly, each data point is clearly visible and no data are obscured by overplotting.A beeswarm plot improves upon the random jittering approach to move data points the minimum distance away from one another to avoid overlays. The result is a plot where you can see each distinct data point, like shown in below plot
sns.set(style="darkgrid")
fig=plt.gcf()
fig.set_size_inches(10,7)
fig = sns.swarmplot(x="Species", y="PetalLengthCm", data=iris)
14. Box and Swarm plot combined
sns.set(style="darkgrid")
fig=plt.gcf()
fig.set_size_inches(10,7)
fig= sns.boxplot(x="Species", y="PetalLengthCm", data=iris, whis=np.inf)
fig= sns.swarmplot(x="Species", y="PetalLengthCm", data=iris, color=".2")
15. Swarm and Violin plot combined
sns.set(style="whitegrid")
fig=plt.gcf()
fig.set_size_inches(10,7)
ax = sns.violinplot(x="Species", y="PetalLengthCm", data=iris, inner=None)
ax = sns.swarmplot(x="Species", y="PetalLengthCm", data=iris,color="white", edgecolor="black")
16. Species based classification
sns.set(style="darkgrid")
sc=iris[iris.Species=='Iris-setosa'].plot(kind='scatter',x='SepalLengthCm',y='SepalWidthCm',color='red',label='Setosa')
iris[iris.Species=='Iris-versicolor'].plot(kind='scatter',x='SepalLengthCm',y='SepalWidthCm',color='green',label='Versicolor',ax=sc)
iris[iris.Species=='Iris-virginica'].plot(kind='scatter',x='SepalLengthCm',y='SepalWidthCm',color='orange', label='virginica', ax=sc)
sc.set_xlabel('Sepal Length in cm')
sc.set_ylabel('Sepal Width in cm')
sc.set_title('Sepal Length Vs Sepal Width')
sc=plt.gcf()
sc.set_size_inches(10,6)
17. LM PLot
fig=sns.lmplot(x="PetalLengthCm", y="PetalWidthCm",data=iris)
18. FacetGrid
# Petal-length density estimate, one curve per species.
# Parentheses let the chained calls span several lines (a line starting with
# `.map` is otherwise a syntax error); `height` replaces the deprecated `size`.
(
    sns.FacetGrid(iris, hue="Species", height=6)
    .map(sns.kdeplot, "PetalLengthCm")
    .add_legend()
)
plt.ioff()
19. Andrews Curve: In data visualization, an Andrews plot or Andrews curve is a way to visualize structure in high-dimensional data. It is basically a rolled-down, non-integer version of the Kent–Kiviat radar m chart, or a smoothed version of a parallel coordinate plot. In pandas, use Andrews curves to plot and visualize data structure. Each multivariate observation is transformed into a curve that represents the coefficients of a Fourier series. This is useful for detecting outliers in time series data. Use colormap to change the color of the curves.
# `pandas.tools.plotting` no longer exists; the plotting helpers are exposed
# from `pandas.plotting`.
from pandas.plotting import andrews_curves
# One curve per row, coloured by species.
andrews_curves(iris, "Species", colormap='rainbow')
plt.show()
plt.ioff()
20. Parallel coordinate plot: This type of visualisation is used for plotting multivariate, numerical data. Parallel Coordinates Plots are ideal for comparing many variables together and seeing the relationships between them. For example, if you had to compare an array of products with the same attributes (comparing computer or cars specs across different models).
# `pandas.tools.plotting` no longer exists; the plotting helpers are exposed
# from `pandas.plotting`.
from pandas.plotting import parallel_coordinates
# One poly-line per row across the feature axes, coloured by species.
parallel_coordinates(iris, "Species")
21. Radviz Plot RadViz Visualizer. RadViz is a multivariate data visualization algorithm that plots each feature dimension uniformly around the circumference of a circle then plots points on the interior of the circle such that the point normalizes its values on the axes from the center to each arc.
# `pandas.tools.plotting` no longer exists; the plotting helpers are exposed
# from `pandas.plotting`.
from pandas.plotting import radviz
# One point per row, coloured by species.
radviz(iris, "Species")
22. Factor Plot
# `factorplot` was renamed to `catplot` and later removed; its default
# rendering was a point plot, so `kind='point'` is requested explicitly.
# Columns are passed by keyword because recent seaborn releases reject
# positional x/y.
# catplot is figure-level and creates its own figure, so it cannot be drawn
# into an existing `ax`.
sns.catplot(x='Species', y='SepalLengthCm', data=iris, kind='point')
plt.ioff()
plt.show()
23. Boxen Plot
fig=plt.gcf()
fig.set_size_inches(10,7)
fig=sns.boxenplot(x='Species',y='SepalLengthCm',data=iris)
24.Residual Plot : The most useful way to plot the residuals, though, is with your predicted values on the x-axis, and your residuals on the y-axis. The distance from the line at 0 is how bad the prediction was for that value.
# Residuals of a regression of sepal width on sepal length, with a LOWESS
# curve through them (`lowess=True`).
fig = plt.gcf()
fig.set_size_inches(10, 7)
# x/y are passed by keyword: recent seaborn releases reject them positionally.
fig = sns.residplot(x='SepalLengthCm', y='SepalWidthCm', data=iris, lowess=True)
25.Venn Diagram : A Venn diagram (also called primary diagram, set diagram or logic diagram) is a diagram that shows all possible logical relations between a finite collection of different sets. Each set is represented by a circle. The circle size represents the importance of the group. The groups are usually overlapping: the size of the overlap represents the intersection between both groups.
# venn2
from matplotlib_venn import venn2
# Columns are picked by position; with `Id` dropped earlier these are
# presumably the sepal length/width and petal length/width columns — verify
# the column order of the loaded CSV.
sepal_length = iris.iloc[:,0]
sepal_width = iris.iloc[:,1]
petal_length = iris.iloc[:,2]
petal_width = iris.iloc[:,3]
# First way to call the 2 group Venn diagram
# NOTE(review): the three subset sizes are derived from the column lengths and
# a hard-coded 15; no overlap between the two columns is actually computed.
venn2(subsets = (len(sepal_length)-15, len(sepal_width)-15, 15), set_labels = ('sepal_length', 'sepal_width'))
plt.show()
26. Spider Graph
from math import ceil, pi

# Radar ("spider") chart comparing one sample flower from each of two species.
categories = list(iris)[:4]  # first four column names
N = len(categories)
# One spoke per feature, evenly spaced around the circle; the first angle is
# appended again so each polygon closes.
angles = [n / float(N) * 2 * pi for n in range(N)]
angles = angles + angles[:1]
plt.figure(figsize=(10, 10))
ax = plt.subplot(111, polar=True)
ax.set_theta_offset(pi / 2)
ax.set_theta_direction(-1)
plt.xticks(angles[:-1], categories)
ax.set_rlabel_position(0)
# The radial limit is derived from the data rather than hard-coded, so no
# sample can fall outside the plotted range.
r_max = int(ceil(iris[categories].values.max()))
r_ticks = list(range(0, r_max + 1, 2))
plt.yticks(r_ticks, [str(tick) for tick in r_ticks], color="red", size=7)
plt.ylim(0, r_max)

# Plot the first row of each species. The previous code plotted rows 0 and 1
# of the frame and labelled the second one "versicolor" without selecting a
# versicolor row.
for species, legend_label, fill_colour in (
    ('Iris-setosa', 'setosa', 'b'),
    ('Iris-versicolor', 'versicolor', 'orange'),
):
    sample = iris[iris.Species == species].iloc[0]
    values = sample.drop("Species").values.flatten().tolist()
    values = values + values[:1]  # close the polygon
    ax.plot(angles, values, linewidth=1, linestyle="solid", label=legend_label)
    ax.fill(angles, values, fill_colour, alpha=0.1)
plt.legend(loc="upper left", bbox_to_anchor=(0.1, 0.1))
plt.show()
27.Donut plot
# Donut chart: a pie of the four feature column lengths with a white disc
# drawn over its centre.
feature_names = "sepal_length","sepal_width","petal_length","petal_width"
feature_size = [
    len(column)
    for column in (sepal_length, sepal_width, petal_length, petal_width)
]
# White disc for the centre of the plot.
circle = plt.Circle((0, 0), 0.2, color="white")
plt.pie(
    feature_size,
    labels=feature_names,
    colors=["red", "green", "blue", "cyan"],
)
p = plt.gcf()
p.gca().add_artist(circle)
plt.title("Number of Each Features")
plt.show()
28.KDE Plot
# Create a kde plot of sepal_length versus sepal width for setosa species of flower.
# Keep only the rows whose Species is 'Iris-setosa'.
sub=iris[iris['Species']=='Iris-setosa']
# NOTE(review): a two-column frame is passed through `data=` together with
# `shade` / `shade_lowest`; this presumably relies on an older seaborn kdeplot
# signature — confirm it still produces a bivariate plot on the installed
# seaborn version.
sns.kdeplot(data=sub[['SepalLengthCm','SepalWidthCm']],cmap="plasma", shade=True, shade_lowest=False)
plt.title('Iris-setosa')
plt.xlabel('Sepal Length Cm')
plt.ylabel('Sepal Width Cm')
import seaborn as sns
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
sns.set(style="ticks", color_codes=True)
iris=pd.read_csv('../input/Iris.csv')
iris.drop('Id',axis=1,inplace=True)
print(iris.head())
#sns.pairplot(iris)
sns.pairplot(iris, hue="Species", palette="husl", markers=["o", "s", "D"])
import seaborn as sns
import pandas as pd
sns.set(style="darkgrid")
tips = pd.read_csv('../input/tips.csv')
print(tips.head())
#sns.load_dataset("tips")
# Regression joint plot of tip against total bill. x/y are passed by keyword:
# recent seaborn releases reject them as positional arguments.
g = sns.jointplot(x="total_bill", y="tip", data=tips, kind="reg",
                  xlim=(0, 60), ylim=(0, 12), color="m", height=7)
In this notebook we will explore making pairplots in Python using the seaborn visualization library. We'll start with the default `sns.pairplot` and then look at customizing our plots using `sns.PairGrid`.
# Pandas and numpy for data manipulation
import pandas as pd
import numpy as np
# matplotlib for plotting
import matplotlib.pyplot as plt
import matplotlib
# Set text size
matplotlib.rcParams['font.size'] = 18
# Seaborn for pairplots
import seaborn as sns
sns.set_context('talk', font_scale=1.2);
We will be using GapMinder socioeconomic data that is available in the R package `gapminder`. The data has been saved to a csv file which we will read into a dataframe. There are six columns in the data:
df = pd.read_csv('../input/gapminder-data/gapminder_data.csv')
df.columns = ['country', 'continent', 'year', 'life_exp', 'pop', 'gdp_per_cap']
df.head()
We can quickly find summary stats for the data using the `describe` method of a dataframe.
df.describe()
Let's use the entire dataset and `sns.pairplot` to create a simple, yet useful plot.
sns.pairplot(df);
The default pairplot shows scatter plots between variables on the upper and lower triangle and histograms along the diagonal. Already, we can see some trends such as a positive correlation between `gdp_per_cap` and `life_exp` and between `year` and `life_exp`, which suggests that people in richer countries live longer and that, in general, people have been living longer as time goes on. We can't say what causes these trends, only that there is a correlation.
We can also see that the distributions of `pop` and `gdp_per_cap` are heavily skewed to the right. To better represent the data, we can take the log transform of those columns.
df['log_pop'] = np.log10(df['pop'])
df['log_gdp_per_cap'] = np.log10(df['gdp_per_cap'])
df = df.drop(columns = ['pop', 'gdp_per_cap'])
In order to better understand the data, we can color the pairplot using a categorical variable and the `hue` keyword. First, we will color the plots by the continent.
matplotlib.rcParams['font.size'] = 40
sns.pairplot(df, hue = 'continent');
I don't find stacked histograms (on the diagonal) to be very useful, and there are some issues with overlapping data points (known as overplotting). We can fix these by adding in a few customizations to the `pairplot` call.
`pairplot`
First, let's change the diagonal from a histogram to a kde which can better show the differences between continents. We can also adjust the alpha (intensity) of the scatter plots to better show all the data and change the size of the markers on the scatter plot. Finally, I increase the size of all the plots to better show the data.
# Pair plot coloured by continent, with KDEs on the diagonal and
# semi-transparent, outlined markers. `height` replaces the deprecated `size`
# keyword, matching the `height=` already used for the joint plot in this file.
sns.pairplot(df, hue='continent', diag_kind='kde',
             plot_kws={'alpha': 0.6, 's': 80, 'edgecolor': 'k'}, height=4);
That makes some of the trends more clear. We can see that Oceania and Europe tend to have the highest life expectancy and highest GDP with Asian countries tending to have the greatest population. The density plots on the diagonal are better for when we have data in multiple categories to make comparisons. We can color the plot by any variable we like. For example, here is a plot colored by a decade categorical variable we create from the year column.
# Bin the years into decades. The last bin edge must lie beyond the most
# recent year in the data (2007 is used further down): with
# `range(1950, 2010, 10)` the edges stop at 2000, so every row after 2000 got
# a NaN decade.
df['decade'] = pd.cut(df['year'], bins=range(1950, 2020, 10))
df.head()
# `height` replaces the deprecated `size` keyword.
sns.pairplot(df, hue='decade', diag_kind='kde',
             vars=['life_exp', 'log_pop', 'log_gdp_per_cap'],
             plot_kws={'alpha': 0.6, 's': 80, 'edgecolor': 'k'}, height=4);
In this case, we can now see that life expectancy has increased over the decades, as has population. Retaining the year variable might not make much sense when we are already coloring by the decade.
There is still quite a lot of noise on the scatter plots, mostly because we are plotting many years at once. Let's limit ourselves to the most recent years in the data. Notice how we must now use the `vars` keyword to specify the variables we want to plot. It does not make sense to plot the year variable since it barely varies any more. We will limit the plot to the three remaining numerical variables.
# Pair plot restricted to rows with year >= 2000, coloured by continent.
# `height` replaces the deprecated `size` keyword.
sns.pairplot(df[df['year'] >= 2000],
             vars=['life_exp', 'log_pop', 'log_gdp_per_cap'],
             hue='continent', diag_kind='kde',
             plot_kws={'alpha': 0.6, 's': 80, 'edgecolor': 'k'}, height=4);
plt.suptitle('Pair Plot of Socioeconomic Data for 2000-2007', size = 28);
`sns.PairGrid`
When the options offered by pairplot are not enough, we can move on to the more powerful PairGrid. This allows us to define our own functions to map to the lower and upper triangles and the diagonal. For example, we might want a plot that, instead of showing two instances of the scatter plots, shows the Pearson correlation coefficient (a measure of a linear trend) on one of the triangles. To do this, we can just write a function to calculate the statistic and then map it to the appropriate part of the plot.
First, we will show the basic usage of `sns.PairGrid`. Here, we map a scatter plot to the upper triangle, a histogram to the diagonal, and a 2D density plot to the lower triangle. `PairGrid` is a class and not a function, which means that we need to create an instance and then use methods of that instance to build a plot. Then, after we have added all the methods to the instance, we can show the resulting plot.
# Create an instance of the PairGrid class for the 2007 rows.
# `height` replaces the deprecated `size` keyword.
grid = sns.PairGrid(data=df[df['year'] == 2007],
                    vars=['life_exp', 'log_pop', 'log_gdp_per_cap'], height=4)
# Map different plots to different sections: scatter above the diagonal,
# density contours below it, histograms on it.
grid = grid.map_upper(plt.scatter, color='darkred')
grid = grid.map_lower(sns.kdeplot, cmap='Reds')
grid = grid.map_diag(plt.hist, bins=10, color='darkred', edgecolor='k');
Now that we see how to map different functions to the different elements, we can write our own function to put on the plot. We'll use a simple function to show the correlation coefficients on the scatterplot. (Thanks to this Stack Overflow answer for help on how to write a custom function and map it onto the plot).
# Function to calculate correlation coefficient between two arrays
def corr(x, y, **kwargs):
    """Annotate the current axes with the correlation coefficient of x and y.

    Intended to be handed to a ``PairGrid`` mapping method such as
    ``grid.map_upper(corr)``.

    Parameters
    ----------
    x, y : array-like
        The two samples passed to ``np.corrcoef``.
    **kwargs
        Accepted so the plotting grid can pass extra keywords; unused here.

    Returns
    -------
    None
        The function only draws a text annotation.
    """
    # Off-diagonal entry of the correlation matrix returned by np.corrcoef
    coef = np.corrcoef(x, y)[0][1]
    # The backslash is required: `$\rho$` renders the Greek letter, whereas
    # `$rho$` renders the three italic letters "rho".
    label = r'$\rho$ = ' + str(round(coef, 2))
    # Place the label near the top-left corner, in axes-fraction coordinates
    ax = plt.gca()
    ax.annotate(label, xy=(0.2, 0.95), size=20, xycoords=ax.transAxes)
# Create a pair grid instance for the 2007 rows.
# `height` replaces the deprecated `size` keyword.
grid = sns.PairGrid(data=df[df['year'] == 2007],
                    vars=['life_exp', 'log_pop', 'log_gdp_per_cap'], height=4)
# Map the plots to the locations; the upper triangle gets both the scatter
# and the correlation annotation.
grid = grid.map_upper(plt.scatter, color='darkred')
grid = grid.map_upper(corr)
grid = grid.map_lower(sns.kdeplot, cmap='Reds')
grid = grid.map_diag(plt.hist, bins=10, edgecolor='k', color='darkred');
We can map any function we would like to any of the areas. For example, maybe we would like to show the summary stats on the diagonal.
# Define a summary function
def summary(x, **kwargs):
    """Print rounded summary statistics for ``x`` and blank the current axes.

    The mean, standard deviation, minimum, median and maximum are taken from
    ``Series.describe``. For the series named ``log_pop`` and
    ``log_gdp_per_cap`` the statistics are raised as powers of ten before
    rounding. The statistics are printed, not drawn on the plot.

    Parameters
    ----------
    x : array-like or pandas.Series
        Values to summarise; its ``name`` selects the branch taken.
    **kwargs
        Accepted for compatibility with the plotting grid; unused.
    """
    series = pd.Series(x)
    stats = series.describe().loc[['mean', 'std', 'min', '50%', 'max']]

    # Series that hold log10 values, mapped to their presentation names.
    log_scaled = {
        'log_pop': 'pop stats',
        'log_gdp_per_cap': 'gdp_per_cap stats',
    }
    if stats.name in log_scaled:
        presentation_name = log_scaled[stats.name]
        stats = 10 ** stats  # convert from log to regular scale
    else:
        presentation_name = 'life_exp stats'
    stats.name = presentation_name

    # Round for presentation, hide the axes of the current cell and print.
    stats = stats.round()
    plt.gca().set_axis_off()
    print(stats)
# Create a pair grid instance for the 2007 rows.
# `height` replaces the deprecated `size` keyword.
grid = sns.PairGrid(data=df[df['year'] == 2007],
                    vars=['life_exp', 'log_pop', 'log_gdp_per_cap'], height=4)
# Fill in the mappings; the diagonal cells call `summary` instead of drawing.
grid = grid.map_upper(plt.scatter, color='darkred')
grid = grid.map_upper(corr)
grid = grid.map_lower(sns.kdeplot, cmap='Reds')
grid = grid.map_diag(summary);
We can extend this however we like in order to investigate the data. For most use cases, the `sns.pairplot` function will do everything we require, but if we need the extra options, we can always use the more powerful `sns.PairGrid`. Pair plots are a great method to get a first look at a dataset, and seaborn has extensive capabilities for producing these figures!