Make a two-page Word document of the exploratory data analysis given in the document.
2/2/22, 7:33 PM EDA 2 covid-variants
localhost:8888/nbconvert/html/Downloads/EDA 2 covid-variants.ipynb?download=false 1/16
location date variant
num_sequences perc_sequences num_sequences_total
0 Angola 2020-07-06 Alpha 0 0.0 3
1 Angola 2020-07-06 B.1.1.277 0 0.0 3
2 Angola 2020-07-06 B.1.1.302 0 0.0 3
3 Angola 2020-07-06 B.1.1.519 0 0.0 3
4 Angola 2020-07-06 B.1.160 0 0.0 3
RangeIndex: 100416 entries, 0 to 100415
Data columns (total 6 columns):
# Column Non-Null Count Dtype
--- ------ -------------- -----
0 location 100416 non-null object
1 date 100416 non-null datetime64[ns]
2 variant 100416 non-null object
3 num_sequences 100416 non-null int64
4 perc_sequences 100416 non-null float64
5 num_sequences_total 100416 non-null int64
dtypes: datetime64[ns](1), float64(1), int64(2), object(2)
memory usage: 4.6+ MB
Alpha 4184
B.1.1.277 4184
others 4184
S:677P.Pelican 4184
S:677H.Robin1 4184
Omicron 4184
Mu 4184
In [1]: import numpy as np
import pandas as pd
In [2]: import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline
In [3]: from sklearn.linear_model import LinearRegression
In [4]: df = pd.read_csv(‘covid-variants.csv’,
parse_dates=[“date”])
df.head()
Out[4]:
In [5]: df.info()  # column dtypes, non-null counts and memory usage
In [6]: df.variant.value_counts()  # number of rows per variant label
Out[6]:
2/2/22, 7:33 PM EDA 2 covid-variants
localhost:8888/nbconvert/html/Downloads/EDA 2 covid-variants.ipynb?download=false 2/16
Lambda 4184
Kappa 4184
Iota 4184
Gamma 4184
Eta 4184
Epsilon 4184
Delta 4184
Beta 4184
B.1.620 4184
B.1.367 4184
B.1.258 4184
B.1.221 4184
B.1.177 4184
B.1.160 4184
B.1.1.519 4184
B.1.1.302 4184
non_who 4184
Name: variant, dtype: int64
0 84173
1 2753
2 1405
3 905
4 631
…
1690 1
1719 1
2156 1
1184 1
862 1
Name: num_sequences, Length: 1563, dtype: int64
num_sequences perc_sequences num_sequences_total
count 100416.000000 100416.000000 100416.000000
mean 72.171676 6.154355 1509.582457
std 1669.262169 21.898989 8445.291772
min 0.000000 -0.010000 1.000000
25% 0.000000 0.000000 12.000000
50% 0.000000 0.000000 59.000000
75% 0.000000 0.000000 394.000000
max 142280.000000 100.000000 146170.000000
In [7]: df.num_sequences.value_counts()  # how often each num_sequences value occurs
Out[7]:
In [8]: df.describe()  # summary statistics of the numeric columns
# NOTE(review): the printed summary shows perc_sequences with min = -0.01; a negative percentage
# looks like a data-quality issue in the source file — confirm before relying on this column.
Out[8]:
In [9]: plt.figure(figsize=(22,8))
# One marker per row of df: num_sequences over time, coloured by variant.
sns.scatterplot(x="date", y="num_sequences", data=df, hue="variant")
Out[9]:
2/2/22, 7:33 PM EDA 2 covid-variants
localhost:8888/nbconvert/html/Downloads/EDA 2 covid-variants.ipynb?download=false 3/16
array([‘Angola’, ‘Argentina’, ‘Aruba’, ‘Australia’, ‘Austria’, ‘Bahrain’,
‘Bangladesh’, ‘Belgium’, ‘Belize’, ‘Benin’,
‘Bosnia and Herzegovina’, ‘Botswana’, ‘Brazil’, ‘Brunei’,
‘Bulgaria’, ‘Cambodia’, ‘Cameroon’, ‘Canada’, ‘Chile’, ‘Colombia’,
‘Costa Rica’, ‘Croatia’, ‘Curacao’, ‘Cyprus’, ‘Czechia’, ‘Denmark’,
‘Djibouti’, ‘Dominican Republic’, ‘Ecuador’, ‘Egypt’, ‘Estonia’,
‘Ethiopia’, ‘Fiji’, ‘Finland’, ‘France’, ‘Gambia’, ‘Georgia’,
‘Germany’, ‘Ghana’, ‘Greece’, ‘Guatemala’, ‘Hong Kong’, ‘Hungary’,
‘Iceland’, ‘India’, ‘Indonesia’, ‘Iran’, ‘Iraq’, ‘Ireland’,
‘Israel’, ‘Italy’, ‘Jamaica’, ‘Japan’, ‘Jordan’, ‘Kazakhstan’,
‘Kenya’, ‘Kosovo’, ‘Kuwait’, ‘Latvia’, ‘Lebanon’, ‘Liechtenstein’,
‘Lithuania’, ‘Luxembourg’, ‘Madagascar’, ‘Malawi’, ‘Malaysia’,
‘Maldives’, ‘Malta’, ‘Mauritius’, ‘Mexico’, ‘Moldova’, ‘Monaco’,
‘Mongolia’, ‘Montenegro’, ‘Morocco’, ‘Mozambique’, ‘Nepal’,
‘Netherlands’, ‘New Zealand’, ‘Nigeria’, ‘North Macedonia’,
‘Norway’, ‘Oman’, ‘Pakistan’, ‘Papua New Guinea’, ‘Paraguay’,
‘Peru’, ‘Philippines’, ‘Poland’, ‘Portugal’, ‘Qatar’, ‘Romania’,
‘Russia’, ‘Rwanda’, ‘Senegal’, ‘Serbia’, ‘Seychelles’, ‘Singapore’,
‘Sint Maarten (Dutch part)’, ‘Slovakia’, ‘Slovenia’,
‘South Africa’, ‘South Korea’, ‘Spain’, ‘Sri Lanka’, ‘Suriname’,
‘Sweden’, ‘Switzerland’, ‘Thailand’, ‘Togo’, ‘Trinidad and Tobago’,
‘Turkey’, ‘Uganda’, ‘Ukraine’, ‘United Arab Emirates’,
‘United Kingdom’, ‘United States’, ‘Uruguay’, ‘Vietnam’, ‘Zambia’,
‘Zimbabwe’], dtype=object)
array([‘Alpha’, ‘B.1.1.277’, ‘B.1.1.302’, ‘B.1.1.519’, ‘B.1.160’,
‘B.1.177’, ‘B.1.221’, ‘B.1.258’, ‘B.1.367’, ‘B.1.620’, ‘Beta’,
‘Delta’, ‘Epsilon’, ‘Eta’, ‘Gamma’, ‘Iota’, ‘Kappa’, ‘Lambda’,
‘Mu’, ‘Omicron’, ‘S:677H.Robin1’, ‘S:677P.Pelican’, ‘others’,
‘non_who’], dtype=object)
In [10]: df.location.unique()  # distinct location names present in the data
Out[10]:
In [11]: df.variant.unique()  # distinct variant labels, in order of first appearance
Out[11]:
In [12]: for virus in df.variant.unique():
# One bar chart per variant label: num_sequences grouped by location for that variant.
# NOTE(review): the next two lines and the plt.title line are cut off at the page edge in this
# export, so the exact aggregation, sorting and any row limit cannot be confirmed from here.
most_cases = df.loc[df[‘variant’] == virus].groupby(‘location’)[‘num_sequences’].ag
most_cases = pd.DataFrame({‘Location’:most_cases.index, ‘Number of Case’:most_cases
# NOTE(review): a new figure is opened on every iteration and never closed; the RuntimeWarning
# printed under this cell ("More than 20 figures have been opened") comes from this line.
# Consider closing each figure after it has been rendered.
plt.figure(figsize=(20,8))
sns.barplot(y=’Location’,x=”Number of Case”,data=most_cases,palette=”plasma_r”)
plt.title(‘COUNTRIES HAVE MORE {} CASES THAN OTHERS’.format(virus).upper(),loc=’cen
# NOTE(review): `virus` stays bound after the loop ends and is read again by In [15], In [17]
# and In [18]; at that point it holds the last label returned by df.variant.unique().
2/2/22, 7:33 PM EDA 2 covid-variants
localhost:8888/nbconvert/html/Downloads/EDA 2 covid-variants.ipynb?download=false 4/16
C:\Users\shahk\AppData\Local\Temp/ipykernel_4392/2010383229.py:4: RuntimeWarning: More t
han 20 figures have been opened. Figures created through the pyplot interface (`matplotl
ib.pyplot.figure`) are retained until explicitly closed and may consume too much memory.
(To control this warning, see the rcParam `figure.max_open_warning`).
plt.figure(figsize=(20,8))
2/2/22, 7:33 PM EDA 2 covid-variants
localhost:8888/nbconvert/html/Downloads/EDA 2 covid-variants.ipynb?download=false 5/16
2/2/22, 7:33 PM EDA 2 covid-variants
localhost:8888/nbconvert/html/Downloads/EDA 2 covid-variants.ipynb?download=false 6/16
2/2/22, 7:33 PM EDA 2 covid-variants
localhost:8888/nbconvert/html/Downloads/EDA 2 covid-variants.ipynb?download=false 7/16
2/2/22, 7:33 PM EDA 2 covid-variants
localhost:8888/nbconvert/html/Downloads/EDA 2 covid-variants.ipynb?download=false 8/16
2/2/22, 7:33 PM EDA 2 covid-variants
localhost:8888/nbconvert/html/Downloads/EDA 2 covid-variants.ipynb?download=false 9/16
2/2/22, 7:33 PM EDA 2 covid-variants
localhost:8888/nbconvert/html/Downloads/EDA 2 covid-variants.ipynb?download=false 10/16
2/2/22, 7:33 PM EDA 2 covid-variants
localhost:8888/nbconvert/html/Downloads/EDA 2 covid-variants.ipynb?download=false 11/16
location variant num_sequences perc_sequences num_sequences_total month year day
In [13]: df[‘month’] = df[‘date’].apply(lambda date: date.month)
df[‘year’] = df[‘date’].apply(lambda date: date.year)
df[‘day’] = df[‘date’].apply(lambda date: date.day)
In [14]: df.drop(‘date’,axis=1, inplace=True)
df.head()
Out[14]:
2/2/22, 7:33 PM EDA 2 covid-variants
localhost:8888/nbconvert/html/Downloads/EDA 2 covid-variants.ipynb?download=false 12/16
location variant num_sequences perc_sequences num_sequences_total month year day
0 Angola Alpha 0 0.0 3 7 2020 6
1 Angola B.1.1.277 0 0.0 3 7 2020 6
2 Angola B.1.1.302 0 0.0 3 7 2020 6
3 Angola B.1.1.519 0 0.0 3 7 2020 6
4 Angola B.1.160 0 0.0 3 7 2020 6
In [15]: df_val1 = df.loc[df[“variant”]== virus].groupby(‘month’)[‘num_sequences’].agg(‘sum’).so
# NOTE(review): `virus` is the loop variable left over from In [12]. After that loop it holds the
# last label of df.variant.unique() ('non_who' in Out[11]), so this selects ONE variant rather
# than all variants. The line above is also cut off after ".so" in this export.
# Rebuild the grouped result as a two-column frame for plotting.
df_val1 = pd.DataFrame({‘Month’:df_val1.index, ‘Number of Cases’:df_val1.values})
In [16]: plt.figure(figsize=(14,8))
# Bar per calendar month using the frame built in In [15].
sns.barplot(x='Month', y='Number of Cases',data=df_val1);
# NOTE(review): df_val1 comes from rows where variant == virus (a single variant), so
# "All Summed Variant" in the title may not describe what is plotted — confirm.
plt.title('Monthly Cases Ratio Of All Summed Variant',fontweight="bold");
In [17]: df_val1 = df.loc[df[“variant”]== virus].groupby(‘day’)[‘num_sequences’].agg(‘sum’).sort_
# NOTE(review): `virus` is the variable left over from the In [12] loop (its last value), so only
# one variant is selected here even though the title below says "All Summed Variant".
# `day` is the day-of-month taken from the date in In [13], so each bar pools that day number
# across every month and year. The line above is cut off after ".sort_" in this export.
df_val1 = pd.DataFrame({‘Day’:df_val1.index, ‘Number of Cases’:df_val1.values})
plt.figure(figsize=(14,8))
sns.barplot(x=’Day’, y=’Number of Cases’,data=df_val1);
plt.title(‘Daily Cases Ratio Of All Summed Variant’,fontweight=”bold”);
2/2/22, 7:33 PM EDA 2 covid-variants
localhost:8888/nbconvert/html/Downloads/EDA 2 covid-variants.ipynb?download=false 13/16
In [18]: df_val1 = df.loc[df[“variant”]== virus].groupby(‘year’)[‘num_sequences’].agg(‘sum’).sor
# NOTE(review): `virus` is the variable left over from the In [12] loop (its last value), so only
# one variant is selected here even though the title below says "All Variant".
# The line above is cut off after ".sor" in this export.
df_val1 = pd.DataFrame({‘Year’:df_val1.index, ‘Number of Cases’:df_val1.values})
plt.figure(figsize=(14,8))
sns.barplot(x=’Year’, y=’Number of Cases’,data=df_val1);
plt.title(‘Over All Cases Ratio With Year Of All Variant’,fontweight=”bold”);
In [19]: df_val1 = df.loc[df[“variant”]== ‘Omicron’].groupby(‘month’)[‘num_sequences’].agg(‘sum’
# Omicron rows only, num_sequences summed per calendar month.
# NOTE(review): the line above is cut off after agg('sum' in this export; anything chained after it is unknown.
# Rebuild the grouped result as a two-column frame for plotting.
df_val1 = pd.DataFrame({‘Month’:df_val1.index, ‘Number of Cases’:df_val1.values})
plt.figure(figsize=(14,8))
2/2/22, 7:33 PM EDA 2 covid-variants
localhost:8888/nbconvert/html/Downloads/EDA 2 covid-variants.ipynb?download=false 14/16
# Bar per calendar month for the Omicron frame built at the start of this cell.
sns.barplot(x='Month', y='Number of Cases',data=df_val1);
plt.title('Omicron Cases Monthly Ratio',fontweight="bold");
In [20]: df_val1 = df.loc[df[“variant”]== ‘Omicron’].groupby(‘day’)[‘num_sequences’].agg(‘sum’).
# Omicron rows only, num_sequences summed per `day`. `day` is the day-of-month from In [13],
# so each bar pools that day number across every month and year.
# NOTE(review): the line above is cut off after the trailing "." in this export.
df_val1 = pd.DataFrame({‘Day’:df_val1.index, ‘Number of Cases’:df_val1.values})
plt.figure(figsize=(14,8))
sns.barplot(x=’Day’, y=’Number of Cases’,data=df_val1);
# NOTE(review): the plotted value appears to be a sum of num_sequences, not a ratio — confirm the title wording.
plt.title(‘Omicron Cases Daily Ratio’,fontweight=”bold”);
In [21]: df_val1 = df.loc[df[“variant”]== ‘Omicron’].groupby(‘year’)[‘num_sequences’].agg(‘sum’)
# Omicron rows only, num_sequences summed per calendar year.
# NOTE(review): sibling cells continue with a chained call after agg('sum'); this line may be cut off at the page edge — confirm.
2/2/22, 7:33 PM EDA 2 covid-variants
localhost:8888/nbconvert/html/Downloads/EDA 2 covid-variants.ipynb?download=false 15/16
# Rebuild the grouped result as a two-column frame and draw one bar per year.
df_val1 = pd.DataFrame({‘Year’:df_val1.index, ‘Number of Cases’:df_val1.values})
plt.figure(figsize=(14,8))
sns.barplot(x=’Year’, y=’Number of Cases’,data=df_val1);
# NOTE(review): the plotted value appears to be a sum of num_sequences, not a ratio — confirm the title wording.
plt.title(‘Omicron Cases Yearly Ratio’,fontweight=”bold”);
In [22]: df_val1 = df.loc[df[“variant”]== ‘Omicron’].groupby(‘location’)[‘num_sequences’].agg(‘s
# Omicron rows only, num_sequences aggregated per location.
# NOTE(review): the line above is cut off inside the agg(...) argument in this export, so the
# aggregation function and any sorting or row limit that follows cannot be confirmed from here.
df_val1 = pd.DataFrame({‘Location’:df_val1.index, ‘Number of Cases’:df_val1.values})
plt.figure(figsize=(16,8))
sns.barplot(x=’Location’, y=’Number of Cases’,data=df_val1);
plt.title(‘Highest Omicron Cases Location’,fontweight=”bold”);
# Tilt the location names so neighbouring tick labels do not overlap.
plt.xticks(rotation=30);
2/2/22, 7:33 PM EDA 2 covid-variants
localhost:8888/nbconvert/html/Downloads/EDA 2 covid-variants.ipynb?download=false 16/16
In [ ]: