DATA ANALYSIS:PANDAS¶

import numpy as np

import pandas as pd

labels=['a','b','c']
my_data=[10,20,30]
arr=np.array(my_data)
d={'a':10,'b':20,'c':30}

arr

array([10, 20, 30])

pandas series¶

pd.Series(data=my_data)

0    10
1    20
2    30
dtype: int64

pd.Series(data=my_data,index=labels)

a    10
b    20
c    30
dtype: int64

pd.Series(d)

a    10
b    20
c    30
dtype: int64

pd.Series(arr)

0    10
1    20
2    30
dtype: int32

pd.Series(labels)

0    a
1    b
2    c
dtype: object

pd.Series([sum,print,len])

0      <built-in function sum>
1    <built-in function print>
2      <built-in function len>
dtype: object

# NOTE(review): the output displayed for `a` has 1-4 as the index and the
# country names as the values, as if data and index were swapped when the cell
# was run -- confirm which order is intended.
a=pd.Series([1,2,3,4],['usa','india','china','ussr'])

a

1      usa
2    india
3    china
4     ussr
dtype: object

b=pd.Series(data=[1,2,3,4],index=['delhi','mumbai','goa','imphaql'])

b

delhi      1
mumbai     2
goa        3
imphaql    4
dtype: int64

b['delhi']

1

c=pd.Series(data=labels)

c

0    a
1    b
2    c
dtype: object

c[0]

'a'

a+b

1          NaN
2          NaN
3          NaN
4          NaN
delhi      NaN
goa        NaN
imphaql    NaN
mumbai     NaN
dtype: object

pandas dataframes¶

from numpy.random import randn

np.random.seed(101)

df=pd.DataFrame(randn(5,4),['A','B','C','D','E'],['W','X','Y','Z']) #DATA,INDEX AND COLUMN

df

df['W']

A    2.706850
B    0.651118
C   -2.018168
D    0.188695
E    0.190794
Name: W, dtype: float64

type(df['W'])

pandas.core.series.Series

type(df)

pandas.core.frame.DataFrame

df.W

A    2.706850
B    0.651118
C   -2.018168
D    0.188695
E    0.190794
Name: W, dtype: float64

df[['W','X']]

df['new']=df['W']+df['X']

df

df.drop('new',axis=1)

df.drop('E',axis=0)

df.shape

(5, 5)

df

df.loc['A'] #or df.iloc[index]

W      2.706850
X      0.628133
Y      0.907969
Z      0.503826
new    3.334983
Name: A, dtype: float64

df.loc['A','X']

0.6281327087844596

# Element-wise comparison of df against 0; the result is kept under a name that
# does not shadow the built-in `bool`.
positive_mask=df>0

df[positive_mask]

df

df['W']>0

A     True
B     True
C    False
D     True
E     True
Name: W, dtype: bool

df[df['W']>0]

df[df['W']>0]['X']

A    0.628133
B   -0.319318
D   -0.758872
E    1.978757
Name: X, dtype: float64

arr=df>0

df[arr]

df[(df['W']>0) & (df['X']<1)]

df.reset_index()

newind = 'CA NY WY OR my '.split()

newind

['CA', 'NY', 'WY', 'OR', 'my']

df['states']=newind

df

df.set_index('X')

df

pandas multiindex¶

df=pd.DataFrame(randn(6,2),[1,2,3,4,5,6],[1,2])

df

df.index.names=['ddd']

df

d={'a':[1,2,np.nan],'b':[3,np.nan,5],'c':[6,7,8]}

pd.Series(d)

a    [1, 2, nan]
b    [3, nan, 5]
c      [6, 7, 8]
dtype: object

df=pd.DataFrame(d)

df

df.dropna(axis=1)

df.dropna(thresh=2)  #dropna function

df.fillna(value='k')  #fillna function

bb=df.groupby('c')

bb

<pandas.core.groupby.generic.DataFrameGroupBy object at 0x00000162A0732A48>

bb.mean()

multiindex¶

import numpy as np
import pandas as pd
outside=['g1','g1','g1','g2','g2','g2']
inside=[1,2,3,1,2,3]
a=list(zip(outside,inside))
a=pd.MultiIndex.from_tuples(a)

a

MultiIndex([('g1', 1),
            ('g1', 2),
            ('g1', 3),
            ('g2', 1),
            ('g2', 2),
            ('g2', 3)],
           )

outside

['g1', 'g1', 'g1', 'g2', 'g2', 'g2']

list(zip(outside,inside))

[('g1', 1), ('g1', 2), ('g1', 3), ('g2', 1), ('g2', 2), ('g2', 3)]

from numpy.random import randn
df=pd.DataFrame(randn(6,2),a,['a','b'])

df

df.loc['g1']

df.index.names=['groups','num']

df

df.loc['g2'].loc[2]['a']

-2.0966780980673736

df.xs('g1')

pandas groupby function¶

data={'compony':['google','microsoft','apple','jio','itc','jio'],
      'person':['jay','jash','ashais','anurag','unnati','mansis'],
      'sales':[200,150,123,234,455,566]}

datas=pd.DataFrame(data)

datas

df=datas.groupby('compony')

df.mean()

df.sum()

df.std()

df.describe()

operation¶

df1=pd.DataFrame({'col1':[1,2,3,4],'col2':[333,444,555,666],'col3':['abc','gss','dgf','dsg']})

df1

df1.head()

df1['col2'].unique()

array([333, 444, 555, 666], dtype=int64)

df1['col2'].nunique()

4

df1['col2'].value_counts()

333    1
444    1
555    1
666    1
Name: col2, dtype: int64

def times2(x):
    """Return ``x`` raised to the power of two.

    Despite its name, the function squares its argument rather than doubling it.
    """
    squared = pow(x, 2)
    return squared

df1['col1'].apply(times2)

0     1
1     4
2     9
3    16
Name: col1, dtype: int64

df1['col1'].apply(lambda x:x+2)

0    3
1    4
2    5
3    6
Name: col1, dtype: int64

df1.columns

Index(['col1', 'col2', 'col3'], dtype='object')

df1.index

RangeIndex(start=0, stop=4, step=1)

df1.sort_values('col2')

df1.isnull()

data input and output¶

1:csv¶

2:excel¶

3:html¶

4:sql¶

import pandas as pd

pwd

'C:\\Users\\hp'

# NOTE: this path is written as an ordinary (non-raw) string literal, so "\U" is
# parsed as the start of a \UXXXXXXXX escape and the cell fails with the
# SyntaxError shown below; a raw string (r'...') or forward slashes avoid that.
pd.read_csv('C:\Users\hp\Downloads\annual-enterprise-survey-2018-financial-year-provisional-csv (1).csv') #read csv file

  File "<ipython-input-3-94b9d53f99d2>", line 1
    pd.read_csv('C:\Users\hp\Downloads\annual-enterprise-survey-2018-financial-year-provisional-csv (1).csv') #read csv file
               ^
SyntaxError: (unicode error) 'unicodeescape' codec can't decode bytes in position 2-3: truncated \UXXXXXXXX escape

# NOTE: as with the previous cell, "\U" inside a non-raw string literal is parsed
# as a \UXXXXXXXX escape, which produces the SyntaxError shown below; a raw
# string (r"...") or forward slashes avoid that.
# NOTE(review): the encoding is conventionally spelled "ISO-8859-1" -- confirm.
pd.read_csv("C:\Users\hp\seaborn-data\flights.csv",encoding="ISO=8859-1") #read csv file

  File "<ipython-input-54-062f95b4a9a0>", line 1
    pd.read_csv("C:\Users\hp\seaborn-data\flights.csv",encoding="ISO=8859-1") #read csv file
               ^
SyntaxError: (unicode error) 'unicodeescape' codec can't decode bytes in position 2-3: truncated \UXXXXXXXX escape

#pd.read_excel('filename')

#df.to_excel("name",sheet_name='')

#data=pd.read_html('')

Data Visualization: MATPLOTLIB¶

import matplotlib.pyplot as plt

%matplotlib inline

plt.show()

import numpy as np
x=np.linspace(0,5,11)
y=x**2

y

array([ 0.  ,  0.25,  1.  ,  2.25,  4.  ,  6.25,  9.  , 12.25, 16.  ,
       20.25, 25.  ])

x

array([0. , 0.5, 1. , 1.5, 2. , 2.5, 3. , 3.5, 4. , 4.5, 5. ])

#functional method
plt.plot(x,y)
plt.xlabel('X label')
plt.ylabel('y label')
plt.title('corona')
plt.show()

plt.subplot(1,2,1) #rows,col,plotno
plt.plot(x,y,'r')
plt.subplot(1,2,2)
plt.plot(y,x,'b')

[<matplotlib.lines.Line2D at 0x24e2d26c808>]

#object oriented method
fig=plt.figure()
axes=fig.add_axes([0.1,0.1,0.8,0.8]) #left,bottom,width and height
axes.plot(x,y)
axes.set_xlabel('xlabel')
axes.set_ylabel('ylabel')
axes.set_title('title')

Text(0.5, 1.0, 'title')

# Draw two curves on a single set of axes and label each one in the legend.
fig=plt.figure()
axes1=fig.add_axes([0.1,0.1,0.8,0.8])

# The first curve plots y**2, so its legend entry says "squared" (it was 'ycube').
axes1.plot(x,y**2,label='y squared')
axes1.plot(y,x**3,label='x cube')
axes1.legend(loc=10)  #location from 0.....10

<matplotlib.legend.Legend at 0x24e2d1ce088>

fig,axes=plt.subplots(nrows=1,ncols=2)
# axes.plot(x)
for current in axes:
    current.plot(x,y)

fig,axes=plt.subplots(nrows=1,ncols=2)
# axes.plot(x)
# for current in axes:
#     current.plot(x,y)
axes[0].plot(x,y)
axes[1].plot(y,x)

[<matplotlib.lines.Line2D at 0x24a666f9408>]

# figure size and DPI¶

fig=plt.figure(figsize=(8,2))#width,height
ax=fig.add_axes([0,0,1,1])
ax.plot(x,y)

[<matplotlib.lines.Line2D at 0x24a664c5b08>]

fig.savefig('wallpaper3.jpg',dpi=200)

fig=plt.figure()
ax=fig.add_axes([0,0,1,1])
ax.plot(x,y,color='green',linewidth=4,linestyle='-',marker='o',markersize=10,markerfacecolor='yellow') #and also rgb hex code we get variety of color
ax.set_xlim([0,1])
ax.set_ylim([0,1])

(0, 1)

x

array([0. , 0.5, 1. , 1.5, 2. , 2.5, 3. , 3.5, 4. , 4.5, 5. ])

plt.scatter(x,y)

<matplotlib.collections.PathCollection at 0x24a68c665c8>

seaborn library¶

import seaborn as sns

%matplotlib inline

tips=sns.load_dataset('tips')

tips.head()

sns.distplot(tips['total_bill'],kde=False,bins=100)

<matplotlib.axes._subplots.AxesSubplot at 0x242800a1948>

sns.jointplot(x='total_bill',y='tip',data=tips,kind='kde')

<seaborn.axisgrid.JointGrid at 0x242ff9a4c48>

sns.pairplot(tips,hue='sex',palette='coolwarm')

<seaborn.axisgrid.PairGrid at 0x242ffa9f8c8>

sns.rugplot(tips['total_bill'])

<matplotlib.axes._subplots.AxesSubplot at 0x242fff59448>

sns.distplot(tips['total_bill'])

<matplotlib.axes._subplots.AxesSubplot at 0x242803e8688>

sns.pairplot(tips)

<seaborn.axisgrid.PairGrid at 0x2428043fe48>

sns.jointplot(x='total_bill',y='tip',data=tips)

<seaborn.axisgrid.JointGrid at 0x242819d2248>

categorical plot¶

import seaborn as sns
%matplotlib inline
tips=sns.load_dataset('tips')
tips.head()

import numpy as np
sns.barplot(x='size',y='total_bill',data=tips,estimator=np.std)

<matplotlib.axes._subplots.AxesSubplot at 0x24281d95208>

sns.countplot(x='sex',data=tips)

<matplotlib.axes._subplots.AxesSubplot at 0x24281e10a88>

sns.boxplot(x='day',y='total_bill',data=tips,hue='smoker')

<matplotlib.axes._subplots.AxesSubplot at 0x24281eafd48>

sns.violinplot(x='day',y='total_bill',data=tips,hue='sex',split=True)

<matplotlib.axes._subplots.AxesSubplot at 0x242821b3b88>

# NOTE: seaborn reports that the `split` parameter has been renamed to `dodge`
# (see the UserWarning printed below this cell).
sns.stripplot(x='day',y='total_bill',data=tips,hue='sex',split=True)

C:\Users\hp\Anaconda3\lib\site-packages\seaborn\categorical.py:2775: UserWarning: The `split` parameter has been renamed to `dodge`.
  warnings.warn(msg, UserWarning)

<matplotlib.axes._subplots.AxesSubplot at 0x24282398688>

sns.violinplot(x='day',y='total_bill',data=tips)
sns.swarmplot(x='day',y='total_bill',data=tips,color='black')

<matplotlib.axes._subplots.AxesSubplot at 0x2428358e108>

sns.factorplot(x='day',y='total_bill',data=tips,kind='bar')  #general ploting function

<seaborn.axisgrid.FacetGrid at 0x24283658348>

import seaborn as sns
%matplotlib inline
tips=sns.load_dataset('tips')
flights=sns.load_dataset('flights')

tips.head()

flights.head()

tc=tips.corr()

sns.heatmap(tc)

<matplotlib.axes._subplots.AxesSubplot at 0x2428384f908>

tc

matrix plot¶

import pandas as pd
import seaborn as sns
import matplotlib
%matplotlib inline
tips=sns.load_dataset('tips')
flights=sns.load_dataset('flights')
tips.head()

flights.head()

tc=tips.corr()

sns.heatmap(tc,annot=True,cmap='coolwarm')

<matplotlib.axes._subplots.AxesSubplot at 0x287eba66348>

tc

flights

fp=flights.pivot_table(index='month',columns='year',values='passengers')

sns.heatmap(fp,cmap='magma',linecolor='white',linewidth=5,annot=True)

<matplotlib.axes._subplots.AxesSubplot at 0x287ec052188>

sns.clustermap(fp,cmap='coolwarm',standard_scale=1)

<seaborn.matrix.ClusterGrid at 0x287ec815648>

grids¶

iris=sns.load_dataset('iris')

iris.head()

import matplotlib.pyplot as plt
%matplotlib inline
g=sns.PairGrid(iris)
g.map_diag(sns.distplot)
g.map_upper(plt.scatter)
g.map_lower(sns.kdeplot)

<seaborn.axisgrid.PairGrid at 0x287f22a9788>

tips.head()

g=sns.FacetGrid(data=tips,col='time',row='smoker')
g.map(sns.distplot,'total_bill')

<seaborn.axisgrid.FacetGrid at 0x287f3941d88>

regression plot¶

sns.lmplot(x='total_bill',y='tip',data=tips,hue='sex',markers=['o','v'])

<seaborn.axisgrid.FacetGrid at 0x287f38a6148>

sns.lmplot(x='total_bill',y='tip',data=tips,col='day')

<seaborn.axisgrid.FacetGrid at 0x287f4c71e88>

pandas built-in data visualization¶

import numpy as np

import pandas as pd

%matplotlib inline

import seaborn as sns

tips=sns.load_dataset('tips',index_col=0)

tips.head()

tips['tip'].hist(bins=20)   #histogram graph

<matplotlib.axes._subplots.AxesSubplot at 0x1d8c3ec3d08>

tips['tip'].plot(kind='hist',bins=20)   #overall method to plot graph

<matplotlib.axes._subplots.AxesSubplot at 0x1d8c40c46c8>

tips.plot.bar(stacked=True)

<matplotlib.axes._subplots.AxesSubplot at 0x1d8c4b83788>

tips.plot.line(x='tip',y='size',figsize=(12,3),lw=1)

<matplotlib.axes._subplots.AxesSubplot at 0x1d8c5521608>

tips.plot.scatter(x='size',y='tip')

<matplotlib.axes._subplots.AxesSubplot at 0x1d8c65e74c8>

tips.plot.box()

<matplotlib.axes._subplots.AxesSubplot at 0x1d8c6648488>

d=pd.DataFrame(np.random.randn(1000,2),columns=['a','b'])

d.head()

d.plot.hexbin(x='a',y='b',gridsize=25,cmap='coolwarm')

<matplotlib.axes._subplots.AxesSubplot at 0x1d8c67e2288>

d.plot.kde()

<matplotlib.axes._subplots.AxesSubplot at 0x1d8c6922208>

plotly and cufflinks¶

import pandas as pd
import numpy as np

from plotly import __version__

print(__version__)

4.7.0

import cufflinks as cf

from plotly.offline import download_plotlyjs,init_notebook_mode,plot,iplot

init_notebook_mode(connected=True)

cf.go_offline()

df=pd.DataFrame(np.random.randn(100,4),columns=['a','b','c','d'])

df.head()

df1=pd.DataFrame({'category':['a','b','c'],'values':[23,43,54]})

df1

%matplotlib inline

df.iplot()

df.iplot(kind='scatter',x='a',y='b')

df1.iplot(kind='bar',x='category',y='values')

df.sum().iplot(kind='bar')

df.iplot(kind='box')

df2=pd.DataFrame({'x':[1,2,3,4,5],'y':[10,20,30,20,10],'z':[5,4,3,2,1]})

df2

df2.iplot(kind='surface',colorscale='rdylbu')

df.iplot(kind='hist')

df[['a','b']].iplot(kind='spread')

df.iplot(kind='bubble',x='a',y='b',size='c')

df.scatter_matrix()

geographical plot¶

choropleth map¶

import chart_studio.plotly as py

from plotly.offline import download_plotlyjs,init_notebook_mode,plot,iplot

init_notebook_mode(connected=True)

import plotly.graph_objs as go

data=dict(type='choropleth',locations=['AZ','CA','NY'],locationmode='USA-states',colorscale='portland',
         text=['text1','text2','text3'],z=[1.0,2.0,3.0],colorbar={'title':'colorbar title goes here'})

data

{'type': 'choropleth',
 'locations': ['AZ', 'CA', 'NY'],
 'locationmode': 'USA-states',
 'colorscale': 'portland',
 'text': ['text1', 'text2', 'text3'],
 'z': [1.0, 2.0, 3.0],
 'colorbar': {'title': 'colorbar title goes here'}}

layout=dict(geo={'scope':'usa'})

choromap=go.Figure(data=[data],layout=layout)

iplot(choromap)

import pandas as pd
# df=pd.read_csv('2011_US_AGRI_EXPORTS')

# df=pd.read_excel("C:\\Users\\hp\\Downloads\\WEO_Data (1).xls")

Machine Learning with python¶

#from sklearn.family import model(example:model should be Linear Regression)

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

%matplotlib inline

# NOTE: the path appears to point inside a temporary zip-extraction folder; the
# traceback below shows it could not be found, so this assignment never completed
# and `df` keeps its previous value (the df.columns output further down still
# lists a, b, c, d).
df=pd.read_csv('file:///C:/Users/hp/AppData/Local/Temp/Temp1_usa-housing.zip/USA_Housing.csv')

---------------------------------------------------------------------------
FileNotFoundError                         Traceback (most recent call last)
~\Anaconda3\lib\urllib\request.py in open_local_file(self, req)
   1472         try:
-> 1473             stats = os.stat(localfile)
   1474             size = stats.st_size

FileNotFoundError: [WinError 3] The system cannot find the path specified: 'C:\\Users\\hp\\AppData\\Local\\Temp\\Temp1_usa-housing.zip\\USA_Housing.csv'

During handling of the above exception, another exception occurred:

URLError                                  Traceback (most recent call last)
<ipython-input-39-53a044993694> in <module>
----> 1 df=pd.read_csv('file:///C:/Users/hp/AppData/Local/Temp/Temp1_usa-housing.zip/USA_Housing.csv')

~\Anaconda3\lib\site-packages\pandas\io\parsers.py in parser_f(filepath_or_buffer, sep, delimiter, header, names, index_col, usecols, squeeze, prefix, mangle_dupe_cols, dtype, engine, converters, true_values, false_values, skipinitialspace, skiprows, skipfooter, nrows, na_values, keep_default_na, na_filter, verbose, skip_blank_lines, parse_dates, infer_datetime_format, keep_date_col, date_parser, dayfirst, cache_dates, iterator, chunksize, compression, thousands, decimal, lineterminator, quotechar, quoting, doublequote, escapechar, comment, encoding, dialect, error_bad_lines, warn_bad_lines, delim_whitespace, low_memory, memory_map, float_precision)
    683         )
    684 
--> 685         return _read(filepath_or_buffer, kwds)
    686 
    687     parser_f.__name__ = name

~\Anaconda3\lib\site-packages\pandas\io\parsers.py in _read(filepath_or_buffer, kwds)
    438     # See https://github.com/python/mypy/issues/1297
    439     fp_or_buf, _, compression, should_close = get_filepath_or_buffer(
--> 440         filepath_or_buffer, encoding, compression
    441     )
    442     kwds["compression"] = compression

~\Anaconda3\lib\site-packages\pandas\io\common.py in get_filepath_or_buffer(filepath_or_buffer, encoding, compression, mode)
    194 
    195     if _is_url(filepath_or_buffer):
--> 196         req = urlopen(filepath_or_buffer)
    197         content_encoding = req.headers.get("Content-Encoding", None)
    198         if content_encoding == "gzip":

~\Anaconda3\lib\urllib\request.py in urlopen(url, data, timeout, cafile, capath, cadefault, context)
    220     else:
    221         opener = _opener
--> 222     return opener.open(url, data, timeout)
    223 
    224 def install_opener(opener):

~\Anaconda3\lib\urllib\request.py in open(self, fullurl, data, timeout)
    523             req = meth(req)
    524 
--> 525         response = self._open(req, data)
    526 
    527         # post-process response

~\Anaconda3\lib\urllib\request.py in _open(self, req, data)
    541         protocol = req.type
    542         result = self._call_chain(self.handle_open, protocol, protocol +
--> 543                                   '_open', req)
    544         if result:
    545             return result

~\Anaconda3\lib\urllib\request.py in _call_chain(self, chain, kind, meth_name, *args)
    501         for handler in handlers:
    502             func = getattr(handler, meth_name)
--> 503             result = func(*args)
    504             if result is not None:
    505                 return result

~\Anaconda3\lib\urllib\request.py in file_open(self, req)
   1449                 raise URLError("file:// scheme is supported only on localhost")
   1450         else:
-> 1451             return self.open_local_file(req)
   1452 
   1453     # names for the localhost

~\Anaconda3\lib\urllib\request.py in open_local_file(self, req)
   1488                 return addinfourl(open(localfile, 'rb'), headers, origurl)
   1489         except OSError as exp:
-> 1490             raise URLError(exp)
   1491         raise URLError('file not on local host')
   1492 

URLError: <urlopen error [WinError 3] The system cannot find the path specified: 'C:\\Users\\hp\\AppData\\Local\\Temp\\Temp1_usa-housing.zip\\USA_Housing.csv'>

df

df.head()

df.info()

df.columns

Index(['a', 'b', 'c', 'd'], dtype='object')

sns.pairplot(df)

<seaborn.axisgrid.PairGrid at 0x25b90307548>

sns.distplot(df['Price'])

---------------------------------------------------------------------------
KeyError                                  Traceback (most recent call last)
~\Anaconda3\lib\site-packages\pandas\core\indexes\base.py in get_loc(self, key, method, tolerance)
   2896             try:
-> 2897                 return self._engine.get_loc(key)
   2898             except KeyError:

pandas\_libs\index.pyx in pandas._libs.index.IndexEngine.get_loc()

pandas\_libs\index.pyx in pandas._libs.index.IndexEngine.get_loc()

pandas\_libs\hashtable_class_helper.pxi in pandas._libs.hashtable.PyObjectHashTable.get_item()

pandas\_libs\hashtable_class_helper.pxi in pandas._libs.hashtable.PyObjectHashTable.get_item()

KeyError: 'Price'

During handling of the above exception, another exception occurred:

KeyError                                  Traceback (most recent call last)
<ipython-input-42-87e11caeb2c4> in <module>
----> 1 sns.distplot(df['Price'])

~\Anaconda3\lib\site-packages\pandas\core\frame.py in __getitem__(self, key)
   2978             if self.columns.nlevels > 1:
   2979                 return self._getitem_multilevel(key)
-> 2980             indexer = self.columns.get_loc(key)
   2981             if is_integer(indexer):
   2982                 indexer = [indexer]

~\Anaconda3\lib\site-packages\pandas\core\indexes\base.py in get_loc(self, key, method, tolerance)
   2897                 return self._engine.get_loc(key)
   2898             except KeyError:
-> 2899                 return self._engine.get_loc(self._maybe_cast_indexer(key))
   2900         indexer = self.get_indexer([key], method=method, tolerance=tolerance)
   2901         if indexer.ndim > 1 or indexer.size > 1:

pandas\_libs\index.pyx in pandas._libs.index.IndexEngine.get_loc()

pandas\_libs\index.pyx in pandas._libs.index.IndexEngine.get_loc()

pandas\_libs\hashtable_class_helper.pxi in pandas._libs.hashtable.PyObjectHashTable.get_item()

pandas\_libs\hashtable_class_helper.pxi in pandas._libs.hashtable.PyObjectHashTable.get_item()

KeyError: 'Price'

df.corr()

sns.heatmap(df.corr(),annot=True)

df.columns

X=df[['Avg. Area Income', 'Avg. Area House Age', 'Avg. Area Number of Rooms',
       'Avg. Area Number of Bedrooms', 'Area Population']]

y=df['Price']

from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.4, random_state=101)

from sklearn.linear_model import LinearRegression

lm=LinearRegression()

lm.fit(X_train,y_train)

print(lm.intercept_)

lm.coef_

X_train.columns

CDF=pd.DataFrame(lm.coef_,X.columns,columns=['Coeff'])

CDF

from sklearn.datasets import load_boston

boston=load_boston()

boston.keys()

print(boston['DESCR'])

predictions=lm.predict(X_test)

predictions

y_test

plt.scatter(y_test,predictions)

sns.distplot((y_test-predictions))

	1	2
1	1.025984	-0.156598
2	-0.031579	0.649826
3	2.154846	-0.610259
4	-0.755325	-0.346419
5	0.147027	-0.479448
6	0.558769	1.024810

	1	2
ddd
1	1.025984	-0.156598
2	-0.031579	0.649826
3	2.154846	-0.610259
4	-0.755325	-0.346419
5	0.147027	-0.479448
6	0.558769	1.024810

		a	b
groups	num
g1	1	0.025110	1.681811
	2	0.373285	0.496837
	3	-0.241096	-1.034506
g2	1	0.931957	1.896131
	2	-2.096678	1.204199
	3	-1.670785	0.565982

		a	b
g1	1	0.025110	1.681811
	2	0.373285	0.496837
	3	-0.241096	-1.034506
g2	1	0.931957	1.896131
	2	-2.096678	1.204199
	3	-1.670785	0.565982

	a	b
1	0.025110	1.681811
2	0.373285	0.496837
3	-0.241096	-1.034506

SourceCode

Search This Blog

Data analysis and visualization with python

DATA ANALYSIS:PANDAS¶

pandas series¶

pandas dataframes¶

pandas multiindex¶

multiindex¶

pandas groupby function¶

operation¶

data input and output¶

1:csv¶

2:excel¶

3:html¶

4:sql¶

Data Visualization: MATPLOTLIB¶

# figure size and DPI¶

seaborn library¶

categorical plot¶

matrix plot¶

grids¶

regression plot¶

pandas built-in data visualization¶

plotly and cufflinks¶

geographical plot¶

choropleth map¶

Machine Learning with python¶

Comments

Post a Comment

	W	X	Y	Z
A	2.706850	0.628133	0.907969	0.503826
B	0.651118	-0.319318	-0.848077	0.605965
C	-2.018168	0.740122	0.528813	-0.589001
D	0.188695	-0.758872	-0.933237	0.955057
E	0.190794	1.978757	2.605967	0.683509

	compony	person	sales
0	google	jay	200
1	microsoft	jash	150
2	apple	ashais	123
3	jio	anurag	234
4	itc	unnati	455
5	jio	mansis	566

	sales
	count	mean	std	min	25%	50%	75%	max
compony
apple	1.0	123.0	NaN	123.0	123.0	123.0	123.0	123.0
google	1.0	200.0	NaN	200.0	200.0	200.0	200.0	200.0
itc	1.0	455.0	NaN	455.0	455.0	455.0	455.0	455.0
jio	2.0	400.0	234.759451	234.0	317.0	400.0	483.0	566.0
microsoft	1.0	150.0	NaN	150.0	150.0	150.0	150.0	150.0

	col1	col2	col3
0	False	False	False
1	False	False	False
2	False	False	False
3	False	False	False

	total_bill	tip	sex	smoker	day	time	size
0	16.99	1.01	Female	No	Sun	Dinner	2
1	10.34	1.66	Male	No	Sun	Dinner	3
2	21.01	3.50	Male	No	Sun	Dinner	3
3	23.68	3.31	Male	No	Sun	Dinner	2
4	24.59	3.61	Female	No	Sun	Dinner	4

	year	month	passengers
0	1949	January	112
1	1949	February	118
2	1949	March	132
3	1949	April	129
4	1949	May	121

	total_bill	tip	size
total_bill	1.000000	0.675734	0.598315
tip	0.675734	1.000000	0.489299
size	0.598315	0.489299	1.000000

	sepal_length	sepal_width	petal_length	petal_width	species
0	5.1	3.5	1.4	0.2	setosa
1	4.9	3.0	1.4	0.2	setosa
2	4.7	3.2	1.3	0.2	setosa
3	4.6	3.1	1.5	0.2	setosa
4	5.0	3.6	1.4	0.2	setosa

	a	b
0	0.158794	-1.032904
1	-0.610498	-0.572200
2	-2.080291	-0.515329
3	0.145124	-0.900696
4	0.798563	1.357391

	a	b	c	d
0	-0.617626	0.950844	-0.826805	0.306526
1	-0.485076	-0.962058	-1.220142	1.153995
2	1.008816	0.692163	-1.303469	0.271443
3	0.987260	1.523529	1.827371	-1.419191
4	-0.418931	-0.222241	0.040562	-2.453487