DATA ANALYSIS:PANDAS¶
In [3]:
import numpy as np
In [4]:
import pandas as pd
In [5]:
# Sample inputs reused by the Series examples below: index labels, a plain list,
# the same values as a NumPy array, and a label->value dict
labels=list('abc')
my_data=[10*k for k in (1,2,3)]
arr=np.array(my_data)
d=dict(zip(labels,my_data))
In [6]:
arr
Out[6]:
pandas series¶
In [7]:
pd.Series(data=my_data)
Out[7]:
In [11]:
pd.Series(data=my_data,index=labels)
Out[11]:
In [12]:
pd.Series(d)
Out[12]:
In [13]:
pd.Series(arr)
Out[13]:
In [14]:
pd.Series(labels)
Out[14]:
In [16]:
pd.Series([sum,print,len])
Out[16]:
In [19]:
a=pd.Series([1,2,3,4],['usa','india','china','ussr'])
In [20]:
a
Out[20]:
In [7]:
b=pd.Series(data=[1,2,3,4],index=['delhi','mumbai','goa','imphaql'])
In [8]:
b
Out[8]:
In [10]:
b['delhi']
Out[10]:
In [27]:
c=pd.Series(data=labels)
In [28]:
c
Out[28]:
In [30]:
c[0]
Out[30]:
In [33]:
a+b
Out[33]:
pandas dataframes¶
In [22]:
from numpy.random import randn
In [23]:
np.random.seed(101)
In [24]:
df=pd.DataFrame(randn(5,4),['A','B','C','D','E'],['W','X','Y','Z']) #DATA,INDEX AND COLUMN
In [25]:
df
Out[25]:
In [26]:
df['W']
Out[26]:
In [27]:
type(df['W'])
Out[27]:
In [28]:
type(df)
Out[28]:
In [29]:
df.W
Out[29]:
In [30]:
df[['W','X']]
Out[30]:
In [31]:
df['new']=df['W']+df['X']
In [32]:
df
Out[32]:
In [33]:
df.drop('new',axis=1)
Out[33]:
In [36]:
df.drop('E',axis=0)
Out[36]:
In [37]:
df.shape
Out[37]:
In [38]:
df
Out[38]:
In [39]:
df.loc['A'] #or df.iloc[index]
Out[39]:
In [40]:
df.loc['A','X']
Out[40]:
In [41]:
bool=df>0
In [42]:
df[bool]
Out[42]:
In [43]:
df
Out[43]:
In [67]:
df['W']>0
Out[67]:
In [68]:
df[df['W']>0]
Out[68]:
In [69]:
df[df['W']>0]['X']
Out[69]:
In [70]:
arr=df>0
In [73]:
df[arr]
Out[73]:
In [78]:
df[(df['W']>0) & (df['X']<1)]
Out[78]:
In [80]:
df.reset_index()
Out[80]:
In [85]:
newind = 'CA NY WY OR my '.split()
In [86]:
newind
Out[86]:
In [88]:
df['states']=newind
In [89]:
df
Out[89]:
In [90]:
df.set_index('X')
Out[90]:
In [91]:
df
Out[91]:
pandas multiindex¶
In [95]:
df=pd.DataFrame(randn(6,2),[1,2,3,4,5,6],[1,2])
In [96]:
df
Out[96]:
In [97]:
df.index.names=['ddd']
In [98]:
df
Out[98]:
In [107]:
d={'a':[1,2,np.nan],'b':[3,np.nan,5],'c':[6,7,8]}
In [108]:
pd.Series(d)
Out[108]:
In [112]:
df=pd.DataFrame(d)
In [113]:
df
Out[113]:
In [21]:
df.dropna(axis=1)
Out[21]:
In [115]:
df.dropna(thresh=2) #dropna function
Out[115]:
In [119]:
df.fillna(value='k') #fillna function: replaces every missing value with 'k'
Out[119]:
In [123]:
bb=df.groupby('c')
In [124]:
bb
Out[124]:
bb.mean()
multiindex¶
In [3]:
import numpy as np
import pandas as pd
# Two-level index: the inner numbers 1..3 repeated under each outer group label
outside=['g1']*3+['g2']*3
inside=[1,2,3]*2
a=pd.MultiIndex.from_tuples(list(zip(outside,inside)))
In [4]:
a
Out[4]:
In [5]:
outside
Out[5]:
In [6]:
list(zip(outside,inside))
Out[6]:
In [9]:
from numpy.random import randn
df=pd.DataFrame(randn(6,2),a,['a','b'])
In [10]:
df
Out[10]:
In [12]:
df.loc['g1']
Out[12]:
In [13]:
df.index.names=['groups','num']
In [14]:
df
Out[14]:
In [18]:
df.loc['g2'].loc[2]['a']
Out[18]:
In [20]:
df.xs('g1')
Out[20]:
pandas groupby function¶
In [32]:
data={'compony':['google','microsoft','apple','jio','itc','jio'],
'person':['jay','jash','ashais','anurag','unnati','mansis'],
'sales':[200,150,123,234,455,566]}
In [33]:
datas=pd.DataFrame(data)
In [34]:
datas
Out[34]:
In [35]:
df=datas.groupby('compony')
In [38]:
# Average only the numeric 'sales' column; the grouped frame also holds the
# string column 'person', which recent pandas refuses to average
df[['sales']].mean()
Out[38]:
In [37]:
df.sum()
Out[37]:
In [39]:
# Standard deviation of the numeric 'sales' column only; the string column
# 'person' cannot be reduced this way on recent pandas
df[['sales']].std()
Out[39]:
In [41]:
df.describe()
Out[41]:
operation¶
In [42]:
df1=pd.DataFrame({'col1':[1,2,3,4],'col2':[333,444,555,666],'col3':['abc','gss','dgf','dsg']})
In [43]:
df1
Out[43]:
In [44]:
df1.head()
Out[44]:
In [48]:
df1['col2'].unique()
Out[48]:
In [49]:
df1['col2'].nunique()
Out[49]:
In [50]:
df1['col2'].value_counts()
Out[50]:
In [53]:
def times2(x):
    """Return ``x`` multiplied by two (works element-wise via ``Series.apply``)."""
    return x*2
In [56]:
df1['col1'].apply(times2)
Out[56]:
In [57]:
df1['col1'].apply(lambda x:x+2)
Out[57]:
In [59]:
df1.columns
Out[59]:
In [60]:
df1.index
Out[60]:
In [62]:
df1.sort_values('col2')
Out[62]:
In [64]:
df1.isnull()
Out[64]:
data input and output¶
In [1]:
import pandas as pd
In [2]:
pwd
Out[2]:
In [3]:
# Raw string: in a normal literal the "\U" of this Windows path starts a unicode escape and is a SyntaxError
pd.read_csv(r'C:\Users\hp\Downloads\annual-enterprise-survey-2018-financial-year-provisional-csv (1).csv') #read csv file
In [54]:
# Raw string keeps the backslashes literal ("\U" and "\f" are escapes otherwise);
# the codec name is spelled with a hyphen: ISO-8859-1
pd.read_csv(r"C:\Users\hp\seaborn-data\flights.csv",encoding="ISO-8859-1") #read csv file
In [51]:
#pd.read_excel('filename')
In [74]:
#df.to_excel("name",sheet_name='')
In [76]:
#data=pd.read_html('')
Data Visualization: MATPLOTLIB¶
In [56]:
import matplotlib.pyplot as plt
In [57]:
%matplotlib inline
In [58]:
plt.show()
In [59]:
import numpy as np
# 11 evenly spaced sample points on [0, 5] and their squares
x=np.linspace(start=0,stop=5,num=11)
y=x*x
In [60]:
y
Out[60]:
In [61]:
x
Out[61]:
In [62]:
#functional method
plt.plot(x,y)
plt.xlabel('X label')
plt.ylabel('y label')
plt.title('corona')
plt.show()
In [63]:
plt.subplot(1,2,1) #rows,col,plotno
plt.plot(x,y,'r')
plt.subplot(1,2,2)
plt.plot(y,x,'b')
Out[63]:
In [64]:
#object oriented method
fig=plt.figure()
axes=fig.add_axes([0.1,0.1,0.8,0.8]) #left,bottom,width and height
axes.plot(x,y)
axes.set_xlabel('xlabel')
axes.set_ylabel('ylabel')
axes.set_title('title')
Out[64]:
In [65]:
# Two curves on one axes, identified through a legend
fig=plt.figure()
axes1=fig.add_axes([0.1,0.1,0.8,0.8]) #left,bottom,width and height
# the first curve draws y**2, so its legend entry says "squared" rather than "cube"
axes1.plot(x,y**2,label='y squared')
axes1.plot(y,x**3,label='x cube')
axes1.legend(loc=10) #location codes run from 0 to 10; 10 is 'center'
Out[65]:
In [66]:
fig,axes=plt.subplots(nrows=1,ncols=2)
# `axes` holds one Axes per subplot, so draw on each element rather than on `axes` itself
for current in axes:
    current.plot(x,y)
In [47]:
fig,axes=plt.subplots(nrows=1,ncols=2)
# axes.plot(x)
# for current in axes:
# current.plot(x,y)
axes[0].plot(x,y)
axes[1].plot(y,x)
Out[47]:
# figure size and DPI¶
In [52]:
fig=plt.figure(figsize=(8,2))#width,height
ax=fig.add_axes([0,0,1,1])
ax.plot(x,y)
Out[52]:
In [63]:
fig.savefig('wallpaper3.jpg',dpi=200)
In [91]:
fig=plt.figure()
ax=fig.add_axes([0,0,1,1])
ax.plot(x,y,color='green',linewidth=4,linestyle='-',marker='o',markersize=10,markerfacecolor='yellow') #and also rgb hex code we get variety of color
ax.set_xlim([0,1])
ax.set_ylim([0,1])
Out[91]:
In [80]:
x
Out[80]:
In [92]:
plt.scatter(x,y)
Out[92]:
seaborn library¶
In [22]:
import seaborn as sns
In [23]:
%matplotlib inline
In [24]:
tips=sns.load_dataset('tips')
In [25]:
tips.head()
Out[25]:
In [31]:
sns.distplot(tips['total_bill'],kde=False,bins=100)
Out[31]:
In [27]:
sns.jointplot(x='total_bill',y='tip',data=tips,kind='kde')
Out[27]:
In [28]:
sns.pairplot(tips,hue='sex',palette='coolwarm')
Out[28]:
In [29]:
sns.rugplot(tips['total_bill'])
Out[29]:
In [32]:
sns.distplot(tips['total_bill'])
Out[32]:
In [33]:
sns.pairplot(tips)
Out[33]:
In [35]:
sns.jointplot(x='total_bill',y='tip',data=tips)
Out[35]:
categorical plot¶
In [38]:
import seaborn as sns
%matplotlib inline
tips=sns.load_dataset('tips')
tips.head()
Out[38]:
In [42]:
import numpy as np
sns.barplot(x='size',y='total_bill',data=tips,estimator=np.std)
Out[42]:
In [43]:
sns.countplot(x='sex',data=tips)
Out[43]:
In [46]:
sns.boxplot(x='day',y='total_bill',data=tips,hue='smoker')
Out[46]:
In [50]:
sns.violinplot(x='day',y='total_bill',data=tips,hue='sex',split=True)
Out[50]:
In [53]:
# `dodge=True` separates the hue levels side by side (stripplot's old `split` argument was renamed to `dodge`)
sns.stripplot(x='day',y='total_bill',data=tips,hue='sex',dodge=True)
Out[53]:
In [58]:
sns.violinplot(x='day',y='total_bill',data=tips)
sns.swarmplot(x='day',y='total_bill',data=tips,color='black')
Out[58]:
In [61]:
# `catplot` is the current name of the former `factorplot`
sns.catplot(x='day',y='total_bill',data=tips,kind='bar') #general plotting function
Out[61]:
In [62]:
import seaborn as sns
%matplotlib inline
tips=sns.load_dataset('tips')
flights=sns.load_dataset('flights')
tips.head()
Out[62]:
In [63]:
flights.head()
Out[63]:
In [67]:
# Correlate the numeric columns only; recent pandas raises on non-numeric columns instead of dropping them
tc=tips.select_dtypes('number').corr()
In [69]:
sns.heatmap(tc)
Out[69]:
In [70]:
tc
Out[70]:
matrix plot¶
In [5]:
import pandas as pd
import seaborn as sns
import matplotlib
%matplotlib inline
tips=sns.load_dataset('tips')
flights=sns.load_dataset('flights')
tips.head()
Out[5]:
In [6]:
flights.head()
Out[6]:
In [8]:
# Correlate the numeric columns only; recent pandas raises on non-numeric columns instead of dropping them
tc=tips.select_dtypes('number').corr()
In [12]:
sns.heatmap(tc,annot=True,cmap='coolwarm')
Out[12]:
In [10]:
tc
Out[10]:
In [13]:
flights
Out[13]:
In [17]:
fp=flights.pivot_table(index='month',columns='year',values='passengers')
In [24]:
sns.heatmap(fp,cmap='magma',linecolor='white',linewidth=5,annot=True)
Out[24]:
In [27]:
sns.clustermap(fp,cmap='coolwarm',standard_scale=1)
Out[27]:
grids¶
In [28]:
iris=sns.load_dataset('iris')
In [30]:
iris.head()
Out[30]:
In [45]:
import matplotlib.pyplot as plt
%matplotlib inline
g=sns.PairGrid(iris)
g.map_diag(sns.distplot)
g.map_upper(plt.scatter)
g.map_lower(sns.kdeplot)
Out[45]:
In [46]:
tips.head()
Out[46]:
In [50]:
g=sns.FacetGrid(data=tips,col='time',row='smoker')
g.map(sns.distplot,'total_bill')
Out[50]:
regression plot¶
In [55]:
sns.lmplot(x='total_bill',y='tip',data=tips,hue='sex',markers=['o','v'])
Out[55]:
In [60]:
sns.lmplot(x='total_bill',y='tip',data=tips,col='day')
Out[60]:
pandas built-in data visualization¶
In [1]:
import numpy as np
In [2]:
import pandas as pd
In [7]:
%matplotlib inline
In [8]:
import seaborn as sns
In [9]:
# Keep every column as data: passing index_col=0 would consume the first data column as the row index
tips=sns.load_dataset('tips')
In [11]:
tips.head()
Out[11]:
In [15]:
tips['tip'].hist(bins=20) #histogram graph
Out[15]:
In [19]:
tips['tip'].plot(kind='hist',bins=20) #overall method to plot graph
Out[19]:
In [25]:
tips.plot.bar(stacked=True)
Out[25]:
In [33]:
tips.plot.line(x='tip',y='size',figsize=(12,3),lw=1)
Out[33]:
In [39]:
tips.plot.scatter(x='size',y='tip')
Out[39]:
In [40]:
tips.plot.box()
Out[40]:
In [ ]:
In [42]:
d=pd.DataFrame(np.random.randn(1000,2),columns=['a','b'])
In [43]:
d.head()
Out[43]:
In [46]:
d.plot.hexbin(x='a',y='b',gridsize=25,cmap='coolwarm')
Out[46]:
In [48]:
d.plot.kde()
Out[48]:
plotly and cufflinks¶
In [1]:
import pandas as pd
import numpy as np
In [ ]:
In [2]:
from plotly import __version__
In [3]:
print(__version__)
In [4]:
import cufflinks as cf
In [5]:
from plotly.offline import download_plotlyjs,init_notebook_mode,plot,iplot
In [6]:
init_notebook_mode(connected=True)
In [7]:
cf.go_offline()
In [8]:
df=pd.DataFrame(np.random.randn(100,4),columns=['a','b','c','d'])
In [9]:
df.head()
Out[9]:
In [10]:
df1=pd.DataFrame({'category':['a','b','c'],'values':[23,43,54]})
In [11]:
df1
Out[11]:
In [12]:
%matplotlib inline
In [13]:
df.iplot()
In [14]:
df.iplot(kind='scatter',x='a',y='b')
In [ ]:
In [15]:
df1.iplot(kind='bar',x='category',y='values')
In [16]:
df.sum().iplot(kind='bar')
In [17]:
df.iplot(kind='box')
In [18]:
df2=pd.DataFrame({'x':[1,2,3,4,5],'y':[10,20,30,20,10],'z':[5,4,3,2,1]})
In [ ]:
In [19]:
df2
Out[19]:
In [20]:
df2.iplot(kind='surface',colorscale='rdylbu')
In [21]:
df.iplot(kind='hist')
In [22]:
df[['a','b']].iplot(kind='spread')
In [23]:
df.iplot(kind='bubble',x='a',y='b',size='c')
In [24]:
df.scatter_matrix()
geographical plot¶
choropleth map¶
In [25]:
import chart_studio.plotly as py
In [26]:
from plotly.offline import download_plotlyjs,init_notebook_mode,plot,iplot
In [27]:
init_notebook_mode(connected=True)
In [28]:
import plotly.graph_objs as go
In [29]:
# Trace specification for a US-states choropleth, written as a plain dict literal
data={
    'type':'choropleth',
    'locations':['AZ','CA','NY'],
    'locationmode':'USA-states',
    'colorscale':'portland',
    'text':['text1','text2','text3'],
    'z':[1.0,2.0,3.0],
    'colorbar':{'title':'colorbar title goes here'},
}
In [30]:
data
Out[30]:
In [31]:
layout=dict(geo={'scope':'usa'})
In [32]:
choromap=go.Figure(data=[data],layout=layout)
In [33]:
iplot(choromap)
In [34]:
import pandas as pd
# df=pd.read_csv('2011_US_AGRI_EXPORTS')
In [35]:
# df=pd.read_excel("C:\\Users\\hp\\Downloads\\WEO_Data (1).xls")
Machine Learning with python¶
In [36]:
#from sklearn.family import model(example:model should be Linear Regression)
In [37]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
In [38]:
%matplotlib inline
In [39]:
# Read the CSV from the notebook's working directory; the earlier path pointed into a
# temporary zip-preview folder under one user's AppData, which is not a stable location
df=pd.read_csv('USA_Housing.csv')
In [ ]:
df
In [ ]:
df.head()
In [ ]:
df.info()
In [40]:
df.columns
Out[40]:
In [41]:
sns.pairplot(df)
Out[41]:
In [42]:
sns.distplot(df['Price'])
In [ ]:
# Correlate the numeric columns only; recent pandas raises if the frame still holds a text column
df.select_dtypes('number').corr()
In [ ]:
# Heatmap of the numeric-column correlations (text columns are excluded before correlating)
sns.heatmap(df.select_dtypes('number').corr(),annot=True)
In [ ]:
df.columns
In [ ]:
X=df[['Avg. Area Income', 'Avg. Area House Age', 'Avg. Area Number of Rooms',
'Avg. Area Number of Bedrooms', 'Area Population']]
In [ ]:
y=df['Price']
In [ ]:
from sklearn.model_selection import train_test_split
In [ ]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.4, random_state=101)
In [ ]:
from sklearn.linear_model import LinearRegression
In [ ]:
lm=LinearRegression()
lm.fit(X_train,y_train)
In [ ]:
print(lm.intercept_)
In [ ]:
lm.coef_
In [ ]:
X_train.columns
In [ ]:
CDF=pd.DataFrame(lm.coef_,X.columns,columns=['Coeff'])
In [ ]:
CDF
In [ ]:
# NOTE(review): load_boston has been removed from recent scikit-learn releases (1.2+) — confirm the installed version still provides it
from sklearn.datasets import load_boston
In [ ]:
boston=load_boston()
In [ ]:
boston.keys()
In [ ]:
print(boston['DESCR'])
In [ ]:
predictions=lm.predict(X_test)
In [ ]:
predictions
In [ ]:
y_test
In [ ]:
plt.scatter(y_test,predictions)
In [ ]:
sns.distplot((y_test-predictions))
In [ ]:
In [ ]:
In [ ]:
In [ ]:
In [ ]:
In [ ]:
In [ ]:
In [ ]:
In [ ]:
In [ ]:
In [ ]:
In [ ]:
In [ ]:
Comments
Post a Comment