Data analysis and visulization with python

datascience

DATA ANALYSIS:PANDAS¶

In [3]:
import numpy as np
In [4]:
import pandas as pd
In [5]:
labels=['a','b','c']
my_data=[10,20,30]
arr=np.array(my_data)
d={'a':10,'b':20,'c':30}
In [6]:
arr
Out[6]:
array([10, 20, 30])

pandas series¶

In [7]:
pd.Series(data=my_data)
Out[7]:
0    10
1    20
2    30
dtype: int64
In [11]:
pd.Series(data=my_data,index=labels)
Out[11]:
a    10
b    20
c    30
dtype: int64
In [12]:
pd.Series(d)
Out[12]:
a    10
b    20
c    30
dtype: int64
In [13]:
pd.Series(arr)
Out[13]:
0    10
1    20
2    30
dtype: int32
In [14]:
pd.Series(labels)
Out[14]:
0    a
1    b
2    c
dtype: object
In [16]:
pd.Series([sum,print,len])
Out[16]:
0      <built-in function sum>
1    <built-in function print>
2      <built-in function len>
dtype: object
In [19]:
a=pd.Series([1,2,3,4],['usa','india','china','ussr'])
In [20]:
a
Out[20]:
1      usa
2    india
3    china
4     ussr
dtype: object
In [7]:
b=pd.Series(data=[1,2,3,4],index=['delhi','mumbai','goa','imphaql'])
In [8]:
b
Out[8]:
delhi      1
mumbai     2
goa        3
imphaql    4
dtype: int64
In [10]:
b['delhi']
Out[10]:
1
In [27]:
c=pd.Series(data=labels)
In [28]:
c
Out[28]:
0    a
1    b
2    c
dtype: object
In [30]:
c[0]
Out[30]:
'a'
In [33]:
a+b
Out[33]:
1          NaN
2          NaN
3          NaN
4          NaN
delhi      NaN
goa        NaN
imphaql    NaN
mumbai     NaN
dtype: object

pandas dataframes¶

In [22]:
from numpy.random import randn
In [23]:
np.random.seed(101)
In [24]:
df=pd.DataFrame(randn(5,4),['A','B','C','D','E'],['W','X','Y','Z']) #DATA,INDEX AND COLUMN
In [25]:
df
Out[25]:
W X Y Z
A 2.706850 0.628133 0.907969 0.503826
B 0.651118 -0.319318 -0.848077 0.605965
C -2.018168 0.740122 0.528813 -0.589001
D 0.188695 -0.758872 -0.933237 0.955057
E 0.190794 1.978757 2.605967 0.683509
In [26]:
df['W']
Out[26]:
A    2.706850
B    0.651118
C   -2.018168
D    0.188695
E    0.190794
Name: W, dtype: float64
In [27]:
type(df['W'])
Out[27]:
pandas.core.series.Series
In [28]:
type(df)
Out[28]:
pandas.core.frame.DataFrame
In [29]:
df.W
Out[29]:
A    2.706850
B    0.651118
C   -2.018168
D    0.188695
E    0.190794
Name: W, dtype: float64
In [30]:
df[['W','X']]
Out[30]:
W X
A 2.706850 0.628133
B 0.651118 -0.319318
C -2.018168 0.740122
D 0.188695 -0.758872
E 0.190794 1.978757
In [31]:
df['new']=df['W']+df['X']
In [32]:
df
Out[32]:
W X Y Z new
A 2.706850 0.628133 0.907969 0.503826 3.334983
B 0.651118 -0.319318 -0.848077 0.605965 0.331800
C -2.018168 0.740122 0.528813 -0.589001 -1.278046
D 0.188695 -0.758872 -0.933237 0.955057 -0.570177
E 0.190794 1.978757 2.605967 0.683509 2.169552
In [33]:
df.drop('new',axis=1)
Out[33]:
W X Y Z
A 2.706850 0.628133 0.907969 0.503826
B 0.651118 -0.319318 -0.848077 0.605965
C -2.018168 0.740122 0.528813 -0.589001
D 0.188695 -0.758872 -0.933237 0.955057
E 0.190794 1.978757 2.605967 0.683509
In [36]:
df.drop('E',axis=0)
Out[36]:
W X Y Z new
A 2.706850 0.628133 0.907969 0.503826 3.334983
B 0.651118 -0.319318 -0.848077 0.605965 0.331800
C -2.018168 0.740122 0.528813 -0.589001 -1.278046
D 0.188695 -0.758872 -0.933237 0.955057 -0.570177
In [37]:
df.shape
Out[37]:
(5, 5)
In [38]:
df
Out[38]:
W X Y Z new
A 2.706850 0.628133 0.907969 0.503826 3.334983
B 0.651118 -0.319318 -0.848077 0.605965 0.331800
C -2.018168 0.740122 0.528813 -0.589001 -1.278046
D 0.188695 -0.758872 -0.933237 0.955057 -0.570177
E 0.190794 1.978757 2.605967 0.683509 2.169552
In [39]:
df.loc['A'] #or df.iloc[index]
Out[39]:
W      2.706850
X      0.628133
Y      0.907969
Z      0.503826
new    3.334983
Name: A, dtype: float64
In [40]:
df.loc['A','X']
Out[40]:
0.6281327087844596
In [41]:
bool=df>0
In [42]:
df[bool]
Out[42]:
W X Y Z new
A 2.706850 0.628133 0.907969 0.503826 3.334983
B 0.651118 NaN NaN 0.605965 0.331800
C NaN 0.740122 0.528813 NaN NaN
D 0.188695 NaN NaN 0.955057 NaN
E 0.190794 1.978757 2.605967 0.683509 2.169552
In [43]:
df
Out[43]:
W X Y Z new
A 2.706850 0.628133 0.907969 0.503826 3.334983
B 0.651118 -0.319318 -0.848077 0.605965 0.331800
C -2.018168 0.740122 0.528813 -0.589001 -1.278046
D 0.188695 -0.758872 -0.933237 0.955057 -0.570177
E 0.190794 1.978757 2.605967 0.683509 2.169552
In [67]:
df['W']>0
Out[67]:
A     True
B     True
C    False
D     True
E     True
Name: W, dtype: bool
In [68]:
df[df['W']>0]
Out[68]:
W X Y Z new
A 2.706850 0.628133 0.907969 0.503826 3.334983
B 0.651118 -0.319318 -0.848077 0.605965 0.331800
D 0.188695 -0.758872 -0.933237 0.955057 -0.570177
E 0.190794 1.978757 2.605967 0.683509 2.169552
In [69]:
df[df['W']>0]['X']
Out[69]:
A    0.628133
B   -0.319318
D   -0.758872
E    1.978757
Name: X, dtype: float64
In [70]:
arr=df>0
In [73]:
df[arr]
Out[73]:
W X Y Z new
A 2.706850 0.628133 0.907969 0.503826 3.334983
B 0.651118 NaN NaN 0.605965 0.331800
C NaN 0.740122 0.528813 NaN NaN
D 0.188695 NaN NaN 0.955057 NaN
E 0.190794 1.978757 2.605967 0.683509 2.169552
In [78]:
df[(df['W']>0) & (df['X']<1)]
Out[78]:
W X Y Z new
A 2.706850 0.628133 0.907969 0.503826 3.334983
B 0.651118 -0.319318 -0.848077 0.605965 0.331800
D 0.188695 -0.758872 -0.933237 0.955057 -0.570177
In [80]:
df.reset_index()
Out[80]:
index W X Y Z new
0 A 2.706850 0.628133 0.907969 0.503826 3.334983
1 B 0.651118 -0.319318 -0.848077 0.605965 0.331800
2 C -2.018168 0.740122 0.528813 -0.589001 -1.278046
3 D 0.188695 -0.758872 -0.933237 0.955057 -0.570177
4 E 0.190794 1.978757 2.605967 0.683509 2.169552
In [85]:
newind = 'CA NY WY OR my '.split()
In [86]:
newind
Out[86]:
['CA', 'NY', 'WY', 'OR', 'my']
In [88]:
df['states']=newind
In [89]:
df
Out[89]:
W X Y Z new states
A 2.706850 0.628133 0.907969 0.503826 3.334983 CA
B 0.651118 -0.319318 -0.848077 0.605965 0.331800 NY
C -2.018168 0.740122 0.528813 -0.589001 -1.278046 WY
D 0.188695 -0.758872 -0.933237 0.955057 -0.570177 OR
E 0.190794 1.978757 2.605967 0.683509 2.169552 my
In [90]:
df.set_index('X')
Out[90]:
W Y Z new states
X
0.628133 2.706850 0.907969 0.503826 3.334983 CA
-0.319318 0.651118 -0.848077 0.605965 0.331800 NY
0.740122 -2.018168 0.528813 -0.589001 -1.278046 WY
-0.758872 0.188695 -0.933237 0.955057 -0.570177 OR
1.978757 0.190794 2.605967 0.683509 2.169552 my
In [91]:
df
Out[91]:
W X Y Z new states
A 2.706850 0.628133 0.907969 0.503826 3.334983 CA
B 0.651118 -0.319318 -0.848077 0.605965 0.331800 NY
C -2.018168 0.740122 0.528813 -0.589001 -1.278046 WY
D 0.188695 -0.758872 -0.933237 0.955057 -0.570177 OR
E 0.190794 1.978757 2.605967 0.683509 2.169552 my

pandas multiindex¶

In [95]:
df=pd.DataFrame(randn(6,2),[1,2,3,4,5,6],[1,2])
In [96]:
df
Out[96]:
1 2
1 1.025984 -0.156598
2 -0.031579 0.649826
3 2.154846 -0.610259
4 -0.755325 -0.346419
5 0.147027 -0.479448
6 0.558769 1.024810
In [97]:
df.index.names=['ddd']
In [98]:
df
Out[98]:
1 2
ddd
1 1.025984 -0.156598
2 -0.031579 0.649826
3 2.154846 -0.610259
4 -0.755325 -0.346419
5 0.147027 -0.479448
6 0.558769 1.024810
In [107]:
d={'a':[1,2,np.nan],'b':[3,np.nan,5],'c':[6,7,8]}
In [108]:
pd.Series(d)
Out[108]:
a    [1, 2, nan]
b    [3, nan, 5]
c      [6, 7, 8]
dtype: object
In [112]:
df=pd.DataFrame(d)
In [113]:
df
Out[113]:
a b c
0 1.0 3.0 6
1 2.0 NaN 7
2 NaN 5.0 8
In [21]:
df.dropna(axis=1)
Out[21]:
a b
groups num
g1 1 0.025110 1.681811
2 0.373285 0.496837
3 -0.241096 -1.034506
g2 1 0.931957 1.896131
2 -2.096678 1.204199
3 -1.670785 0.565982
In [115]:
df.dropna(thresh=2)  #dropna function
Out[115]:
a b c
0 1.0 3.0 6
1 2.0 NaN 7
2 NaN 5.0 8
In [119]:
df.fillna(value='k')  #fillna fuction
Out[119]:
a b c
0 1 3 6
1 2 k 7
2 k 5 8
In [123]:
bb=df.groupby('c')
In [124]:
bb
Out[124]:
<pandas.core.groupby.generic.DataFrameGroupBy object at 0x00000162A0732A48>

bb.mean()

multiindex¶

In [3]:
import numpy as np
import pandas as pd
outside=['g1','g1','g1','g2','g2','g2']
inside=[1,2,3,1,2,3]
a=list(zip(outside,inside))
a=pd.MultiIndex.from_tuples(a)
In [4]:
a
Out[4]:
MultiIndex([('g1', 1),
            ('g1', 2),
            ('g1', 3),
            ('g2', 1),
            ('g2', 2),
            ('g2', 3)],
           )
In [5]:
outside
Out[5]:
['g1', 'g1', 'g1', 'g2', 'g2', 'g2']
In [6]:
list(zip(outside,inside))
Out[6]:
[('g1', 1), ('g1', 2), ('g1', 3), ('g2', 1), ('g2', 2), ('g2', 3)]
In [9]:
from numpy.random import randn
df=pd.DataFrame(randn(6,2),a,['a','b'])
In [10]:
df
Out[10]:
a b
g1 1 0.025110 1.681811
2 0.373285 0.496837
3 -0.241096 -1.034506
g2 1 0.931957 1.896131
2 -2.096678 1.204199
3 -1.670785 0.565982
In [12]:
df.loc['g1']
Out[12]:
a b
1 0.025110 1.681811
2 0.373285 0.496837
3 -0.241096 -1.034506
In [13]:
df.index.names=['groups','num']
In [14]:
df
Out[14]:
a b
groups num
g1 1 0.025110 1.681811
2 0.373285 0.496837
3 -0.241096 -1.034506
g2 1 0.931957 1.896131
2 -2.096678 1.204199
3 -1.670785 0.565982
In [18]:
df.loc['g2'].loc[2]['a']
Out[18]:
-2.0966780980673736
In [20]:
df.xs('g1')
Out[20]:
a b
num
1 0.025110 1.681811
2 0.373285 0.496837
3 -0.241096 -1.034506

pandas groupby function¶

In [32]:
data={'compony':['google','microsoft','apple','jio','itc','jio'],
      'person':['jay','jash','ashais','anurag','unnati','mansis'],
      'sales':[200,150,123,234,455,566]}
In [33]:
datas=pd.DataFrame(data)
In [34]:
datas
Out[34]:
compony person sales
0 google jay 200
1 microsoft jash 150
2 apple ashais 123
3 jio anurag 234
4 itc unnati 455
5 jio mansis 566
In [35]:
df=datas.groupby('compony')
In [38]:
df.mean()
Out[38]:
sales
compony
apple 123
google 200
itc 455
jio 400
microsoft 150
In [37]:
df.sum()
Out[37]:
sales
compony
apple 123
google 200
itc 455
jio 800
microsoft 150
In [39]:
df.std()
Out[39]:
sales
compony
apple NaN
google NaN
itc NaN
jio 234.759451
microsoft NaN
In [41]:
df.describe()
Out[41]:
sales
count mean std min 25% 50% 75% max
compony
apple 1.0 123.0 NaN 123.0 123.0 123.0 123.0 123.0
google 1.0 200.0 NaN 200.0 200.0 200.0 200.0 200.0
itc 1.0 455.0 NaN 455.0 455.0 455.0 455.0 455.0
jio 2.0 400.0 234.759451 234.0 317.0 400.0 483.0 566.0
microsoft 1.0 150.0 NaN 150.0 150.0 150.0 150.0 150.0

operation¶

In [42]:
df1=pd.DataFrame({'col1':[1,2,3,4],'col2':[333,444,555,666],'col3':['abc','gss','dgf','dsg']})
In [43]:
df1
Out[43]:
col1 col2 col3
0 1 333 abc
1 2 444 gss
2 3 555 dgf
3 4 666 dsg
In [44]:
df1.head()
Out[44]:
col1 col2 col3
0 1 333 abc
1 2 444 gss
2 3 555 dgf
3 4 666 dsg
In [48]:
df1['col2'].unique()
Out[48]:
array([333, 444, 555, 666], dtype=int64)
In [49]:
df1['col2'].nunique()
Out[49]:
4
In [50]:
df1['col2'].value_counts()
Out[50]:
333    1
444    1
555    1
666    1
Name: col2, dtype: int64
In [53]:
def times2(x):
    return x**2
In [56]:
df1['col1'].apply(times2)
Out[56]:
0     1
1     4
2     9
3    16
Name: col1, dtype: int64
In [57]:
df1['col1'].apply(lambda x:x+2)
Out[57]:
0    3
1    4
2    5
3    6
Name: col1, dtype: int64
In [59]:
df1.columns
Out[59]:
Index(['col1', 'col2', 'col3'], dtype='object')
In [60]:
df1.index
Out[60]:
RangeIndex(start=0, stop=4, step=1)
In [62]:
df1.sort_values('col2')
Out[62]:
col1 col2 col3
0 1 333 abc
1 2 444 gss
2 3 555 dgf
3 4 666 dsg
In [64]:
df1.isnull()
Out[64]:
col1 col2 col3
0 False False False
1 False False False
2 False False False
3 False False False

data input and output¶

1:csv¶

2:excel¶

3:html¶

4:sql¶

In [1]:
import pandas as pd
In [2]:
pwd
Out[2]:
'C:\\Users\\hp'
In [3]:
pd.read_csv('C:\Users\hp\Downloads\annual-enterprise-survey-2018-financial-year-provisional-csv (1).csv') #read csv file
  File "<ipython-input-3-94b9d53f99d2>", line 1
    pd.read_csv('C:\Users\hp\Downloads\annual-enterprise-survey-2018-financial-year-provisional-csv (1).csv') #read csv file
               ^
SyntaxError: (unicode error) 'unicodeescape' codec can't decode bytes in position 2-3: truncated \UXXXXXXXX escape
In [54]:
pd.read_csv("C:\Users\hp\seaborn-data\flights.csv",encoding="ISO=8859-1") #read csv file
  File "<ipython-input-54-062f95b4a9a0>", line 1
    pd.read_csv("C:\Users\hp\seaborn-data\flights.csv",encoding="ISO=8859-1") #read csv file
               ^
SyntaxError: (unicode error) 'unicodeescape' codec can't decode bytes in position 2-3: truncated \UXXXXXXXX escape
In [51]:
#pd.read_excel('filename')
In [74]:
#df.to_excel("name",shheet_name='')
In [76]:
#data=pd.read_html('')

Data Visulization: MATPLOTLIB¶

In [56]:
import matplotlib.pyplot as plt
In [57]:
%matplotlib inline
In [58]:
plt.show()
In [59]:
import numpy as np
x=np.linspace(0,5,11)
y=x**2
In [60]:
y
Out[60]:
array([ 0.  ,  0.25,  1.  ,  2.25,  4.  ,  6.25,  9.  , 12.25, 16.  ,
       20.25, 25.  ])
In [61]:
x
Out[61]:
array([0. , 0.5, 1. , 1.5, 2. , 2.5, 3. , 3.5, 4. , 4.5, 5. ])
In [62]:
#functional method
plt.plot(x,y)
plt.xlabel('X label')
plt.ylabel('y label')
plt.title('corona')
plt.show()
In [63]:
plt.subplot(1,2,1) #rows,col,plotno
plt.plot(x,y,'r')
plt.subplot(1,2,2)
plt.plot(y,x,'b')
Out[63]:
[<matplotlib.lines.Line2D at 0x24e2d26c808>]
In [64]:
#object oriented method
fig=plt.figure()
axes=fig.add_axes([0.1,0.1,0.8,0.8]) #left,bottom,width and height
axes.plot(x,y)
axes.set_xlabel('xlabel')
axes.set_ylabel('ylabel')
axes.set_title('title')
Out[64]:
Text(0.5, 1.0, 'title')
In [65]:
fig=plt.figure()
axes1=fig.add_axes([0.1,0.1,0.8,0.8])

axes1.plot(x,y**2,label='ycube')
axes1.plot(y,x**3,label='x cube')
axes1.legend(loc=10)  #location from 0.....10
Out[65]:
<matplotlib.legend.Legend at 0x24e2d1ce088>
In [66]:
fig,axes=plt.subplots(nrows=1,ncols=2)
# axes.plot(x)
for current in axes:
    current.plot(x,y)
In [47]:
fig,axes=plt.subplots(nrows=1,ncols=2)
# axes.plot(x)
# for current in axes:
#     current.plot(x,y)
axes[0].plot(x,y)
axes[1].plot(y,x)
Out[47]:
[<matplotlib.lines.Line2D at 0x24a666f9408>]

# figure size and DPI¶

In [52]:
fig=plt.figure(figsize=(8,2))#width,height
ax=fig.add_axes([0,0,1,1])
ax.plot(x,y)
Out[52]:
[<matplotlib.lines.Line2D at 0x24a664c5b08>]
In [63]:
fig.savefig('wallpaper3.jpg',dpi=200)
In [91]:
fig=plt.figure()
ax=fig.add_axes([0,0,1,1])
ax.plot(x,y,color='green',linewidth=4,linestyle='-',marker='o',markersize=10,markerfacecolor='yellow') #and also rgb hex code we get variety of color
ax.set_xlim([0,1])
ax.set_ylim([0,1])
Out[91]:
(0, 1)
In [80]:
x
Out[80]:
array([0. , 0.5, 1. , 1.5, 2. , 2.5, 3. , 3.5, 4. , 4.5, 5. ])
In [92]:
plt.scatter(x,y)
Out[92]:
<matplotlib.collections.PathCollection at 0x24a68c665c8>

seaboarn library¶

In [22]:
import seaborn as sns
In [23]:
%matplotlib inline
In [24]:
tips=sns.load_dataset('tips')
In [25]:
tips.head()
Out[25]:
total_bill tip sex smoker day time size
0 16.99 1.01 Female No Sun Dinner 2
1 10.34 1.66 Male No Sun Dinner 3
2 21.01 3.50 Male No Sun Dinner 3
3 23.68 3.31 Male No Sun Dinner 2
4 24.59 3.61 Female No Sun Dinner 4
In [31]:
sns.distplot(tips['total_bill'],kde=False,bins=100)
Out[31]:
<matplotlib.axes._subplots.AxesSubplot at 0x242800a1948>
In [27]:
sns.jointplot(x='total_bill',y='tip',data=tips,kind='kde')
Out[27]:
<seaborn.axisgrid.JointGrid at 0x242ff9a4c48>
In [28]:
sns.pairplot(tips,hue='sex',palette='coolwarm')
Out[28]:
<seaborn.axisgrid.PairGrid at 0x242ffa9f8c8>
In [29]:
sns.rugplot(tips['total_bill'])
Out[29]:
<matplotlib.axes._subplots.AxesSubplot at 0x242fff59448>
In [32]:
sns.distplot(tips['total_bill'])
Out[32]:
<matplotlib.axes._subplots.AxesSubplot at 0x242803e8688>
In [33]:
sns.pairplot(tips)
Out[33]:
<seaborn.axisgrid.PairGrid at 0x2428043fe48>
In [35]:
sns.jointplot(x='total_bill',y='tip',data=tips)
Out[35]:
<seaborn.axisgrid.JointGrid at 0x242819d2248>

categorial plot¶

In [38]:
import seaborn as sns
%matplotlib inline
tips=sns.load_dataset('tips')
tips.head()
Out[38]:
total_bill tip sex smoker day time size
0 16.99 1.01 Female No Sun Dinner 2
1 10.34 1.66 Male No Sun Dinner 3
2 21.01 3.50 Male No Sun Dinner 3
3 23.68 3.31 Male No Sun Dinner 2
4 24.59 3.61 Female No Sun Dinner 4
In [42]:
import numpy as np
sns.barplot(x='size',y='total_bill',data=tips,estimator=np.std)
Out[42]:
<matplotlib.axes._subplots.AxesSubplot at 0x24281d95208>
In [43]:
sns.countplot(x='sex',data=tips)
Out[43]:
<matplotlib.axes._subplots.AxesSubplot at 0x24281e10a88>
In [46]:
sns.boxplot(x='day',y='total_bill',data=tips,hue='smoker')
Out[46]:
<matplotlib.axes._subplots.AxesSubplot at 0x24281eafd48>
In [50]:
sns.violinplot(x='day',y='total_bill',data=tips,hue='sex',split=True)
Out[50]:
<matplotlib.axes._subplots.AxesSubplot at 0x242821b3b88>
In [53]:
sns.stripplot(x='day',y='total_bill',data=tips,hue='sex',split=True)
C:\Users\hp\Anaconda3\lib\site-packages\seaborn\categorical.py:2775: UserWarning: The `split` parameter has been renamed to `dodge`.
  warnings.warn(msg, UserWarning)
Out[53]:
<matplotlib.axes._subplots.AxesSubplot at 0x24282398688>
In [58]:
sns.violinplot(x='day',y='total_bill',data=tips)
sns.swarmplot(x='day',y='total_bill',data=tips,color='black')
Out[58]:
<matplotlib.axes._subplots.AxesSubplot at 0x2428358e108>
In [61]:
sns.factorplot(x='day',y='total_bill',data=tips,kind='bar')  #general ploting function
Out[61]:
<seaborn.axisgrid.FacetGrid at 0x24283658348>
In [62]:
import seaborn as sns
%matplotlib inline
tips=sns.load_dataset('tips')
flights=sns.load_dataset('flights')

tips.head()
Out[62]:
total_bill tip sex smoker day time size
0 16.99 1.01 Female No Sun Dinner 2
1 10.34 1.66 Male No Sun Dinner 3
2 21.01 3.50 Male No Sun Dinner 3
3 23.68 3.31 Male No Sun Dinner 2
4 24.59 3.61 Female No Sun Dinner 4
In [63]:
flights.head()
Out[63]:
year month passengers
0 1949 January 112
1 1949 February 118
2 1949 March 132
3 1949 April 129
4 1949 May 121
In [67]:
tc=tips.corr()
In [69]:
sns.heatmap(tc)
Out[69]:
<matplotlib.axes._subplots.AxesSubplot at 0x2428384f908>
In [70]:
tc
Out[70]:
total_bill tip size
total_bill 1.000000 0.675734 0.598315
tip 0.675734 1.000000 0.489299
size 0.598315 0.489299 1.000000

matrix plot¶

In [5]:
import pandas as pd
import seaborn as sns
import matplotlib
%matplotlib inline
tips=sns.load_dataset('tips')
flights=sns.load_dataset('flights')
tips.head()
Out[5]:
total_bill tip sex smoker day time size
0 16.99 1.01 Female No Sun Dinner 2
1 10.34 1.66 Male No Sun Dinner 3
2 21.01 3.50 Male No Sun Dinner 3
3 23.68 3.31 Male No Sun Dinner 2
4 24.59 3.61 Female No Sun Dinner 4
In [6]:
flights.head()
Out[6]:
year month passengers
0 1949 January 112
1 1949 February 118
2 1949 March 132
3 1949 April 129
4 1949 May 121
In [8]:
tc=tips.corr()
In [12]:
sns.heatmap(tc,annot=True,cmap='coolwarm')
Out[12]:
<matplotlib.axes._subplots.AxesSubplot at 0x287eba66348>
In [10]:
tc
Out[10]:
total_bill tip size
total_bill 1.000000 0.675734 0.598315
tip 0.675734 1.000000 0.489299
size 0.598315 0.489299 1.000000
In [13]:
flights
Out[13]:
year month passengers
0 1949 January 112
1 1949 February 118
2 1949 March 132
3 1949 April 129
4 1949 May 121
... ... ... ...
139 1960 August 606
140 1960 September 508
141 1960 October 461
142 1960 November 390
143 1960 December 432

144 rows × 3 columns

In [17]:
fp=flights.pivot_table(index='month',columns='year',values='passengers')
In [24]:
sns.heatmap(fp,cmap='magma',linecolor='white',linewidth=5,annot=True)
Out[24]:
<matplotlib.axes._subplots.AxesSubplot at 0x287ec052188>
In [27]:
sns.clustermap(fp,cmap='coolwarm',standard_scale=1)
Out[27]:
<seaborn.matrix.ClusterGrid at 0x287ec815648>

grids¶

In [28]:
iris=sns.load_dataset('iris')
In [30]:
iris.head()
Out[30]:
sepal_length sepal_width petal_length petal_width species
0 5.1 3.5 1.4 0.2 setosa
1 4.9 3.0 1.4 0.2 setosa
2 4.7 3.2 1.3 0.2 setosa
3 4.6 3.1 1.5 0.2 setosa
4 5.0 3.6 1.4 0.2 setosa
In [45]:
import matplotlib.pyplot as plt
%matplotlib inline
g=sns.PairGrid(iris)
g.map_diag(sns.distplot)
g.map_upper(plt.scatter)
g.map_lower(sns.kdeplot)
Out[45]:
<seaborn.axisgrid.PairGrid at 0x287f22a9788>
In [46]:
tips.head()
Out[46]:
total_bill tip sex smoker day time size
0 16.99 1.01 Female No Sun Dinner 2
1 10.34 1.66 Male No Sun Dinner 3
2 21.01 3.50 Male No Sun Dinner 3
3 23.68 3.31 Male No Sun Dinner 2
4 24.59 3.61 Female No Sun Dinner 4
In [50]:
g=sns.FacetGrid(data=tips,col='time',row='smoker')
g.map(sns.distplot,'total_bill')
Out[50]:
<seaborn.axisgrid.FacetGrid at 0x287f3941d88>

regression plot¶

In [55]:
sns.lmplot(x='total_bill',y='tip',data=tips,hue='sex',markers=['o','v'])
Out[55]:
<seaborn.axisgrid.FacetGrid at 0x287f38a6148>
In [60]:
sns.lmplot(x='total_bill',y='tip',data=tips,col='day')
Out[60]:
<seaborn.axisgrid.FacetGrid at 0x287f4c71e88>

pandas built-in data visulization¶

In [1]:
import numpy as np 
In [2]:
import pandas as pd
In [7]:
%matplotlib inline
In [8]:
import seaborn as sns
In [9]:
tips=sns.load_dataset('tips',index_col=0)
In [11]:
tips.head()
Out[11]:
tip sex smoker day time size
total_bill
16.99 1.01 Female No Sun Dinner 2
10.34 1.66 Male No Sun Dinner 3
21.01 3.50 Male No Sun Dinner 3
23.68 3.31 Male No Sun Dinner 2
24.59 3.61 Female No Sun Dinner 4
In [15]:
tips['tip'].hist(bins=20)   #histogram graph
Out[15]:
<matplotlib.axes._subplots.AxesSubplot at 0x1d8c3ec3d08>
In [19]:
tips['tip'].plot(kind='hist',bins=20)   #overall method to plot graph
Out[19]:
<matplotlib.axes._subplots.AxesSubplot at 0x1d8c40c46c8>
In [25]:
tips.plot.bar(stacked=True)
Out[25]:
<matplotlib.axes._subplots.AxesSubplot at 0x1d8c4b83788>
In [33]:
tips.plot.line(x='tip',y='size',figsize=(12,3),lw=1)
Out[33]:
<matplotlib.axes._subplots.AxesSubplot at 0x1d8c5521608>
In [39]:
tips.plot.scatter(x='size',y='tip')
Out[39]:
<matplotlib.axes._subplots.AxesSubplot at 0x1d8c65e74c8>
In [40]:
tips.plot.box()
Out[40]:
<matplotlib.axes._subplots.AxesSubplot at 0x1d8c6648488>
In [ ]:
 
In [42]:
d=pd.DataFrame(np.random.randn(1000,2),columns=['a','b'])
In [43]:
d.head()
Out[43]:
a b
0 0.158794 -1.032904
1 -0.610498 -0.572200
2 -2.080291 -0.515329
3 0.145124 -0.900696
4 0.798563 1.357391
In [46]:
d.plot.hexbin(x='a',y='b',gridsize=25,cmap='coolwarm')
Out[46]:
<matplotlib.axes._subplots.AxesSubplot at 0x1d8c67e2288>
In [48]:
d.plot.kde()
Out[48]:
<matplotlib.axes._subplots.AxesSubplot at 0x1d8c6922208>

plotly and cufflinks¶

In [1]:
import pandas as pd
import numpy as np
In [ ]:
 
In [2]:
from plotly import __version__
In [3]:
print(__version__)
4.7.0
In [4]:
import cufflinks as cf
In [5]:
from plotly.offline import download_plotlyjs,init_notebook_mode,plot,iplot
In [6]:
init_notebook_mode(connected=True)
In [7]:
cf.go_offline()
In [8]:
df=pd.DataFrame(np.random.randn(100,4),columns=['a','b','c','d'])
In [9]:
df.head()
Out[9]:
a b c d
0 -0.617626 0.950844 -0.826805 0.306526
1 -0.485076 -0.962058 -1.220142 1.153995
2 1.008816 0.692163 -1.303469 0.271443
3 0.987260 1.523529 1.827371 -1.419191
4 -0.418931 -0.222241 0.040562 -2.453487
In [10]:
df1=pd.DataFrame({'category':['a','b','c'],'values':[23,43,54]})
In [11]:
df1
Out[11]:
category values
0 a 23
1 b 43
2 c 54
In [12]:
%matplotlib inline
In [13]:
df.iplot()
In [14]:
df.iplot(kind='scatter',x='a',y='b')
−2.5−2−1.5−1−0.500.511.52−2−1012Export to plot.ly »
In [ ]:
 
In [15]:
df1.iplot(kind='bar',x='category',y='values')
In [16]:
df.sum().iplot(kind='bar')
abcd−16−14−12−10−8−6−4−202Export to plot.ly »
In [17]:
df.iplot(kind='box')
In [18]:
df2=pd.DataFrame({'x':[1,2,3,4,5],'y':[10,20,30,20,10],'z':[5,4,3,2,1]})
In [ ]:
 
In [19]:
df2
Out[19]:
x y z
0 1 10 5
1 2 20 4
2 3 30 3
3 4 20 2
4 5 10 1
In [20]:
df2.iplot(kind='surface',colorscale='rdylbu')
In [21]:
df.iplot(kind='hist')
In [22]:
df[['a','b']].iplot(kind='spread')
In [23]:
df.iplot(kind='bubble',x='a',y='b',size='c')
In [24]:
df.scatter_matrix()
−20201020−202−202−202−202−202−202−202−202−202010203040−202−202−202−202−202−202−202−202−20205101520−202−202−202−202−202−202−202−202−20205101520Export to plot.ly »

geographical plot¶

chloropeth map¶

In [25]:
import chart_studio.plotly as py
In [26]:
from plotly.offline import download_plotlyjs,init_notebook_mode,plot,iplot
In [27]:
init_notebook_mode(connected=True)
In [28]:
import plotly.graph_objs as go
In [29]:
data=dict(type='choropleth',locations=['AZ','CA','NY'],locationmode='USA-states',colorscale='portland',
         text=['text1','text2','text3'],z=[1.0,2.0,3.0],colorbar={'title':'colorbar title goes here'})
In [30]:
data
Out[30]:
{'type': 'choropleth',
 'locations': ['AZ', 'CA', 'NY'],
 'locationmode': 'USA-states',
 'colorscale': 'portland',
 'text': ['text1', 'text2', 'text3'],
 'z': [1.0, 2.0, 3.0],
 'colorbar': {'title': 'colorbar title goes here'}}
In [31]:
layout=dict(geo={'scope':'usa'})
In [32]:
choromap=go.Figure(data=[data],layout=layout)
In [33]:
iplot(choromap)
In [34]:
import pandas as pd
# df=pd.read_csv('2011_US_AGRI_EXPORTS')
In [35]:
# df=pd.read_excel("C:\\Users\\hp\\Downloads\\WEO_Data (1).xls")

Machine Learning with python¶

In [36]:
#from sklearn.family import model(example:model should be Linear Regression)
In [37]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
In [38]:
%matplotlib inline
In [39]:
df=pd.read_csv('file:///C:/Users/hp/AppData/Local/Temp/Temp1_usa-housing.zip/USA_Housing.csv')
---------------------------------------------------------------------------
FileNotFoundError                         Traceback (most recent call last)
~\Anaconda3\lib\urllib\request.py in open_local_file(self, req)
   1472         try:
-> 1473             stats = os.stat(localfile)
   1474             size = stats.st_size

FileNotFoundError: [WinError 3] The system cannot find the path specified: 'C:\\Users\\hp\\AppData\\Local\\Temp\\Temp1_usa-housing.zip\\USA_Housing.csv'

During handling of the above exception, another exception occurred:

URLError                                  Traceback (most recent call last)
<ipython-input-39-53a044993694> in <module>
----> 1 df=pd.read_csv('file:///C:/Users/hp/AppData/Local/Temp/Temp1_usa-housing.zip/USA_Housing.csv')

~\Anaconda3\lib\site-packages\pandas\io\parsers.py in parser_f(filepath_or_buffer, sep, delimiter, header, names, index_col, usecols, squeeze, prefix, mangle_dupe_cols, dtype, engine, converters, true_values, false_values, skipinitialspace, skiprows, skipfooter, nrows, na_values, keep_default_na, na_filter, verbose, skip_blank_lines, parse_dates, infer_datetime_format, keep_date_col, date_parser, dayfirst, cache_dates, iterator, chunksize, compression, thousands, decimal, lineterminator, quotechar, quoting, doublequote, escapechar, comment, encoding, dialect, error_bad_lines, warn_bad_lines, delim_whitespace, low_memory, memory_map, float_precision)
    683         )
    684 
--> 685         return _read(filepath_or_buffer, kwds)
    686 
    687     parser_f.__name__ = name

~\Anaconda3\lib\site-packages\pandas\io\parsers.py in _read(filepath_or_buffer, kwds)
    438     # See https://github.com/python/mypy/issues/1297
    439     fp_or_buf, _, compression, should_close = get_filepath_or_buffer(
--> 440         filepath_or_buffer, encoding, compression
    441     )
    442     kwds["compression"] = compression

~\Anaconda3\lib\site-packages\pandas\io\common.py in get_filepath_or_buffer(filepath_or_buffer, encoding, compression, mode)
    194 
    195     if _is_url(filepath_or_buffer):
--> 196         req = urlopen(filepath_or_buffer)
    197         content_encoding = req.headers.get("Content-Encoding", None)
    198         if content_encoding == "gzip":

~\Anaconda3\lib\urllib\request.py in urlopen(url, data, timeout, cafile, capath, cadefault, context)
    220     else:
    221         opener = _opener
--> 222     return opener.open(url, data, timeout)
    223 
    224 def install_opener(opener):

~\Anaconda3\lib\urllib\request.py in open(self, fullurl, data, timeout)
    523             req = meth(req)
    524 
--> 525         response = self._open(req, data)
    526 
    527         # post-process response

~\Anaconda3\lib\urllib\request.py in _open(self, req, data)
    541         protocol = req.type
    542         result = self._call_chain(self.handle_open, protocol, protocol +
--> 543                                   '_open', req)
    544         if result:
    545             return result

~\Anaconda3\lib\urllib\request.py in _call_chain(self, chain, kind, meth_name, *args)
    501         for handler in handlers:
    502             func = getattr(handler, meth_name)
--> 503             result = func(*args)
    504             if result is not None:
    505                 return result

~\Anaconda3\lib\urllib\request.py in file_open(self, req)
   1449                 raise URLError("file:// scheme is supported only on localhost")
   1450         else:
-> 1451             return self.open_local_file(req)
   1452 
   1453     # names for the localhost

~\Anaconda3\lib\urllib\request.py in open_local_file(self, req)
   1488                 return addinfourl(open(localfile, 'rb'), headers, origurl)
   1489         except OSError as exp:
-> 1490             raise URLError(exp)
   1491         raise URLError('file not on local host')
   1492 

URLError: <urlopen error [WinError 3] The system cannot find the path specified: 'C:\\Users\\hp\\AppData\\Local\\Temp\\Temp1_usa-housing.zip\\USA_Housing.csv'>
In [ ]:
df
In [ ]:
df.head()
In [ ]:
df.info()
In [40]:
df.columns
Out[40]:
Index(['a', 'b', 'c', 'd'], dtype='object')
In [41]:
sns.pairplot(df)
Out[41]:
<seaborn.axisgrid.PairGrid at 0x25b90307548>
In [42]:
sns.distplot(df['Price'])
---------------------------------------------------------------------------
KeyError                                  Traceback (most recent call last)
~\Anaconda3\lib\site-packages\pandas\core\indexes\base.py in get_loc(self, key, method, tolerance)
   2896             try:
-> 2897                 return self._engine.get_loc(key)
   2898             except KeyError:

pandas\_libs\index.pyx in pandas._libs.index.IndexEngine.get_loc()

pandas\_libs\index.pyx in pandas._libs.index.IndexEngine.get_loc()

pandas\_libs\hashtable_class_helper.pxi in pandas._libs.hashtable.PyObjectHashTable.get_item()

pandas\_libs\hashtable_class_helper.pxi in pandas._libs.hashtable.PyObjectHashTable.get_item()

KeyError: 'Price'

During handling of the above exception, another exception occurred:

KeyError                                  Traceback (most recent call last)
<ipython-input-42-87e11caeb2c4> in <module>
----> 1 sns.distplot(df['Price'])

~\Anaconda3\lib\site-packages\pandas\core\frame.py in __getitem__(self, key)
   2978             if self.columns.nlevels > 1:
   2979                 return self._getitem_multilevel(key)
-> 2980             indexer = self.columns.get_loc(key)
   2981             if is_integer(indexer):
   2982                 indexer = [indexer]

~\Anaconda3\lib\site-packages\pandas\core\indexes\base.py in get_loc(self, key, method, tolerance)
   2897                 return self._engine.get_loc(key)
   2898             except KeyError:
-> 2899                 return self._engine.get_loc(self._maybe_cast_indexer(key))
   2900         indexer = self.get_indexer([key], method=method, tolerance=tolerance)
   2901         if indexer.ndim > 1 or indexer.size > 1:

pandas\_libs\index.pyx in pandas._libs.index.IndexEngine.get_loc()

pandas\_libs\index.pyx in pandas._libs.index.IndexEngine.get_loc()

pandas\_libs\hashtable_class_helper.pxi in pandas._libs.hashtable.PyObjectHashTable.get_item()

pandas\_libs\hashtable_class_helper.pxi in pandas._libs.hashtable.PyObjectHashTable.get_item()

KeyError: 'Price'
In [ ]:
df.corr()
In [ ]:
sns.heatmap(df.corr(),annot=True)
In [ ]:
df.columns
In [ ]:
X=df[['Avg. Area Income', 'Avg. Area House Age', 'Avg. Area Number of Rooms',
       'Avg. Area Number of Bedrooms', 'Area Population']]
In [ ]:
y=df['Price']
In [ ]:
from sklearn.model_selection import train_test_split
In [ ]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.4, random_state=101)
In [ ]:
from sklearn.linear_model import LinearRegression
In [ ]:
lm=LinearRegression()

lm.fit(X_train,y_train)

In [ ]:
print(lm.intercept_)
In [ ]:
lm.coef_
In [ ]:
X_train.columns
In [ ]:
CDF=pd.DataFrame(lm.coef_,X.columns,columns=['Coeff'])
In [ ]:
CDF
In [ ]:
from sklearn.datasets import load_boston
In [ ]:
boston=load_boston()
In [ ]:
boston.keys()
In [ ]:
print(boston['DESCR'])
In [ ]:
predictions=lm.predict(X_test)
In [ ]:
predictions
In [ ]:
y_test
In [ ]:
plt.scatter(y_test,predictions)
In [ ]:
sns.distplot((y_test-predictions))
In [ ]:
 
In [ ]:
 
In [ ]:
 
In [ ]:
 
In [ ]:
 
In [ ]:
 
In [ ]:
 
In [ ]:
 
In [ ]:
 
In [ ]:
 
In [ ]:
 
In [ ]:
 
In [ ]:
 

Comments