In [1]:
from pandas import DataFrame, Series
import pandas as pd
# Load the iris measurements into a DataFrame; the column labels are supplied via `names=`.
# NOTE(review): this is an absolute, machine-specific path - the notebook only runs
# where this exact file exists.
iris_data = pd.read_csv(
    "/home/mridul/nilmtk/iris.data",
    names=['Sepal Length', 'Sepal Width', 'Petal Length', 'Petal Width', 'Class'],
)
We load the given data file and convert it into a pandas DataFrame object.
In [2]:
# Preview the first five rows of the loaded frame.
iris_data.head()
Out[2]:
In [55]:
# Tally how many rows carry each class label and print the tally under a heading.
print('Distribution of Dataset as per classes')
class_count = iris_data['Class'].value_counts()
print(class_count)
Initial distribution of the dataset across its respective classes.
In [20]:
%matplotlib inline
import matplotlib.pyplot as plt
class_count.plot(kind='barh', rot=0, xlim=(0,60))
Out[20]:
In [34]:
print("Printing the Total area of the length summations of all lengths and widths")

# Area chart of the frame's plottable columns against the row index,
# drawn with the four listed colours and the y-axis limited to 0-30.
iris_data.plot(
    kind='area',
    color=['red', 'blue', 'green', 'orange'],
    ylim=(0, 30),
    title="Area Graph",
)
plt.show()
Because the dataset is sorted by class, the area graph shows how the total of all lengths and breadths changes from one class to the next.
In [52]:
# The cells that follow compute means (not medians), so the announcement says "means".
print("We will now try to measure the overall and class-wise means and standard deviations respectively.")

# Spot-check three arbitrary rows by index label.
# `.loc` is used instead of `.ix`: `.ix` was deprecated and later removed from pandas,
# and for an integer-labelled index both perform the same label-based lookup.
print(iris_data.loc[[2, 78, 132]])
In [53]:
# Column-wise mean and standard deviation of the whole dataset.
# numeric_only=True states explicitly that the string 'Class' column is left out;
# relying on pandas to skip it silently stops working in newer pandas releases.
data_mean = iris_data.mean(numeric_only=True)
data_std = iris_data.std(numeric_only=True)

# Heading and values are joined into one string so the output is the same
# under the Python 2 print statement and the Python 3 print function.
print("Iris Dataset overall mean\n" + str(data_mean))
print('\nIris Dataset overall standard deviation\n' + str(data_std))
In [60]:
# Distinct class labels found in the 'Class' column.
# NOTE(review): `[:-1]` leaves the last row out before collecting the labels -
# presumably to skip a trailing blank/NaN row produced when reading the file; confirm.
class_val = iris_data[:-1]['Class'].unique()

# Print all labels on one line, separated by spaces. A single print call replaces
# the Python-2-only `print i,` loop so the cell also runs on Python 3.
print(' '.join(str(label) for label in class_val))
In [65]:
# For every class label, select its rows and print the first two of them.
for label in class_val:
    cur_class = iris_data.loc[iris_data['Class'] == label]
    print(cur_class.head(2))
Finding the mean and standard deviation for every single class and collecting them in a new DataFrame.
The new DataFrame also includes the difference of each class's values from the overall mean and standard deviation.
In [108]:
# Overall statistics per measurement column. numeric_only=True makes the exclusion of
# the string 'Class' column explicit instead of relying on pandas skipping it silently
# (newer pandas releases no longer do that).
data_mean = iris_data.mean(numeric_only=True)
data_std = iris_data.std(numeric_only=True)

# Measurement columns = every column except the class label. Selecting by name rather
# than by position (`columns[:-1]`) keeps the cell working when it is re-run after
# further columns have been appended to iris_data.
measure_cols = [col for col in iris_data.columns if col != 'Class']

# One record per (class, measurement type):
#   (Class, Type, Mean, Mean - overall mean, Deviation, Deviation - overall deviation)
# The 'All' records are the baseline, so both of their differences are 0.0.
data = [('All', col, data_mean[col], 0.0, data_std[col], 0.0) for col in measure_cols]

for label in class_val:
    # Filter the rows of this class once and reuse them for both statistics.
    class_rows = iris_data.loc[iris_data['Class'] == label, measure_cols]
    class_mean = class_rows.mean()
    class_std = class_rows.std()
    for col in measure_cols:
        data.append((
            label,
            col,
            class_mean[col],
            class_mean[col] - data_mean[col],
            class_std[col],
            class_std[col] - data_std[col],
        ))

plot_df = pd.DataFrame(data, columns=['Class', 'Type', 'Mean', 'Mean Var', 'Deviation', 'Dev Var'])

# DataFrame.sort() was removed from pandas; sort_values() is its replacement.
print(plot_df.sort_values('Class'))
In [145]:
# One bar chart per measurement type, comparing 'All' with each class.
# The types are taken from plot_df itself. The previous `iris_data.columns[:-4]`
# depended on how many columns iris_data happened to have when the cell ran:
# with the five columns present at this point of a top-to-bottom run it selects
# only 'Sepal Length', so the other three charts were never drawn.
for measure in plot_df['Type'].unique():
    ax = plot_df[plot_df['Type'] == measure].plot(kind='bar', title=measure, x='Class', figsize=(9, 4))
    # Horizontal reference lines at the overall mean and the overall standard
    # deviation of this measurement.
    ax.axhline(data_mean[measure], color='black')
    ax.axhline(data_std[measure], color='black')
From the above analysis we can determine, for each class, which types of measurement lie above or below the overall average. We can also compare the standard deviations to see how far each class's spread departs from the overall values.
Example: We notice that the mean Petal Width of Iris-virginica is much higher than the overall mean. However, its standard deviation is much lower than the overall standard deviation, suggesting tightly clustered values of high magnitude.
In [154]:
# Add three derived columns to iris_data (in place):
#   'Total Length' = sepal length + petal length
#   'Total Width'  = sepal width + petal width
#   'Total'        = 'Total Length' + 'Total Width'
iris_data['Total Length'] = iris_data['Sepal Length'].add(iris_data['Petal Length'])
iris_data['Total Width'] = iris_data['Sepal Width'].add(iris_data['Petal Width'])
iris_data['Total'] = iris_data['Total Length'].add(iris_data['Total Width'])

# Show the first five rows including the new columns.
print(iris_data.head())
An upper or lower percentile slice can be obtained by sorting on the field of interest and taking the first rows of the sorted result.
In [188]:
# NOTE(review): `iris_data[:-1]` leaves the last row out before sorting - presumably to
# skip a trailing blank/NaN row from the file - while the row count below is taken from
# the full frame; confirm that this is intended.
# DataFrame.sort(columns=...) was removed from pandas; sort_values(by=...) replaces it.

print("Calculating Upper Percentile for any field that is required. \nFor eg. the total lengths summed up coming in the top 25%")
n = 25
# Number of rows that make up the top n percent (rounded down).
values = int(iris_data.shape[0]*n/100.0)
print("Adding top %s values" % values)
# Sort descending and keep exactly `values` rows, so the number of rows shown matches
# the count announced above (the former `[:values+1]` showed one row too many).
print(iris_data[:-1].sort_values(by='Total', ascending=False)['Total'][:values])

print("\n\nCalculating Lower Percentile for any field that is required. \nFor eg. the total lengths summed up coming in the last 15%")
m = 15
# Number of rows that make up the bottom m percent (rounded down).
values = int(iris_data.shape[0]*m/100.0)
print("Adding last %s values" % values)
# Sort ascending and keep exactly `values` rows.
print(iris_data[:-1].sort_values(by='Total', ascending=True)['Total'][:values])
Final plots comparing the overall lengths, widths and the sum of all dimensions within each class.
We also plot all classes together in one graph to estimate which class has the largest and smallest dimensions on average.
In [199]:
# Columns shown in every area chart of this cell.
total_columns = ['Total Length', 'Total Width', 'Total']

# One stacked area chart per class, titled with the class label.
# NOTE(review): `[:-1]` leaves the last row out before collecting the labels -
# presumably to skip a trailing blank/NaN row from the file; confirm.
for label in iris_data[:-1]['Class'].unique():
    class_totals = iris_data.loc[iris_data['Class'] == label, total_columns]
    class_totals.plot(kind="area", stacked=True, title=label)

# The same chart over all rows, for comparison across classes.
iris_data[total_columns].plot(kind="area", stacked=True, title="Overall")
Out[199]: