import pandas as pd
import seaborn as srn
import statistics as sts
import matplotlib.pyplot as plt
url = 'https://docs.google.com/spreadsheets/d/1jd3c7CpUC0pgSxLVYXSFncqVEea3hLOivc3MlNq-axo/gviz/tq?tqx=out:csv'
dataset = pd.read_csv(url)
dataset.head()
Campaign Name | Clicks | Impressions | CPC | CTR | Conversions | CVR | |
---|---|---|---|---|---|---|---|
0 | Campaign 465 | 348.0 | 4045 | $1.50 | 8.60% | 11 | 3.16% |
1 | Campaign 57 | 908.0 | 17668 | $1.48 | 5.14% | 53 | 5.84% |
2 | Campaign 59 | 331.0 | 6464 | $1.45 | 5.12% | 18 | 5.44% |
3 | Campaign 464 | 909.0 | 17677 | $1.45 | 5.14% | 29 | 3.19% |
4 | Campaign 8 | 542.0 | 5424 | $1.44 | 9.99% | 32 | 5.90% |
# Size
dataset.shape
(500, 7)
# Type
print(dataset.info())
<class 'pandas.core.frame.DataFrame'> RangeIndex: 500 entries, 0 to 499 Data columns (total 7 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 Campaign Name 500 non-null object 1 Clicks 499 non-null float64 2 Impressions 500 non-null int64 3 CPC 499 non-null object 4 CTR 499 non-null object 5 Conversions 500 non-null int64 6 CVR 499 non-null object dtypes: float64(1), int64(2), object(4) memory usage: 27.5+ KB None
# Visualize
dataset.head()
Campaign Name | Clicks | Impressions | CPC | CTR | Conversions | CVR | |
---|---|---|---|---|---|---|---|
0 | Campaign 465 | 348.0 | 4045 | $1.50 | 8.60% | 11 | 3.16% |
1 | Campaign 57 | 908.0 | 17668 | $1.48 | 5.14% | 53 | 5.84% |
2 | Campaign 59 | 331.0 | 6464 | $1.45 | 5.12% | 18 | 5.44% |
3 | Campaign 464 | 909.0 | 17677 | $1.45 | 5.14% | 29 | 3.19% |
4 | Campaign 8 | 542.0 | 5424 | $1.44 | 9.99% | 32 | 5.90% |
dataset['Campaign Name'].describe()
Campaign Name | |
---|---|
count | 500 |
unique | 500 |
top | Campaign 465 |
freq | 1 |
dataset['Clicks'].describe()
Clicks | |
---|---|
count | 499.000000 |
mean | 589.042084 |
std | 233.816528 |
min | 202.000000 |
25% | 376.500000 |
50% | 583.000000 |
75% | 780.000000 |
max | 1000.000000 |
dataset['Impressions'].describe()
Impressions | |
---|---|
count | 500.000000 |
mean | 7089.748000 |
std | 3700.703581 |
min | 2004.000000 |
25% | 4254.750000 |
50% | 6358.000000 |
75% | 8835.000000 |
max | 19414.000000 |
dataset['CPC'].describe()
CPC | |
---|---|
count | 499 |
unique | 85 |
top | $0.69 |
freq | 17 |
dataset['CTR'].describe()
CTR | |
---|---|
count | 499 |
unique | 328 |
top | 7.51% |
freq | 37 |
dataset['Conversions'].describe()
Conversions | |
---|---|
count | 500.000000 |
mean | 28.224000 |
std | 11.949368 |
min | 8.000000 |
25% | 18.000000 |
50% | 28.000000 |
75% | 36.250000 |
max | 58.000000 |
dataset['CVR'].describe()
CVR | |
---|---|
count | 499 |
unique | 225 |
top | 5.82% |
freq | 8 |
# Type
print(dataset.info())
<class 'pandas.core.frame.DataFrame'> RangeIndex: 500 entries, 0 to 499 Data columns (total 7 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 Campaign Name 500 non-null object 1 Clicks 499 non-null float64 2 Impressions 500 non-null int64 3 CPC 499 non-null object 4 CTR 499 non-null object 5 Conversions 500 non-null int64 6 CVR 499 non-null object dtypes: float64(1), int64(2), object(4) memory usage: 27.5+ KB None
# Converting Object to Numeric
temp_cpc = pd.to_numeric(dataset['CPC'], errors='coerce')
print(temp_cpc.head())
0 NaN 1 NaN 2 NaN 3 NaN 4 NaN Name: CPC, dtype: float64
# Replacing commas with periods
temp_cpc = dataset['CPC'].str.replace(',', '.', regex=True)
print(temp_cpc.head())
0 $1.50 1 $1.48 2 $1.45 3 $1.45 4 $1.44 Name: CPC, dtype: object
# Removing non-numeric characters such as symbols or spaces
temp_cpc = dataset['CPC'].str.replace('[^\d.]', '', regex=True)
print(temp_cpc.head())
0 1.50 1 1.48 2 1.45 3 1.45 4 1.44 Name: CPC, dtype: object
# Converting Object to Numeric
temp_cpc = pd.to_numeric(temp_cpc, errors='coerce')
print(temp_cpc.head())
0 1.50 1 1.48 2 1.45 3 1.45 4 1.44 Name: CPC, dtype: float64
dataset['CPC'] = temp_cpc
print(dataset['CPC'].head())
0 1.50 1 1.48 2 1.45 3 1.45 4 1.44 Name: CPC, dtype: float64
# Converting Object to Numeric
temp_ctr = pd.to_numeric(dataset['CTR'], errors='coerce')
print(temp_ctr.head())
0 NaN 1 NaN 2 NaN 3 NaN 4 NaN Name: CTR, dtype: float64
# Replacing commas with periods
temp_ctr = dataset['CTR'].str.replace(',', '.', regex=True)
print(temp_ctr.head())
0 8.60% 1 5.14% 2 5.12% 3 5.14% 4 9.99% Name: CTR, dtype: object
# Removing non-numeric characters such as symbols or spaces
temp_ctr = dataset['CTR'].str.replace('[^\d.]', '', regex=True)
print(temp_ctr.head())
0 8.60 1 5.14 2 5.12 3 5.14 4 9.99 Name: CTR, dtype: object
# Converting Object to Numeric
temp_ctr = pd.to_numeric(temp_ctr, errors='coerce')
print(temp_ctr.head())
0 8.60 1 5.14 2 5.12 3 5.14 4 9.99 Name: CTR, dtype: float64
dataset['CTR'] = temp_ctr
print(dataset['CTR'].head())
0 8.60 1 5.14 2 5.12 3 5.14 4 9.99 Name: CTR, dtype: float64
# Converting Object to Numeric
temp_cvr = pd.to_numeric(dataset['CVR'], errors='coerce')
print(temp_cvr.head())
0 NaN 1 NaN 2 NaN 3 NaN 4 NaN Name: CVR, dtype: float64
# Replacing commas with periods
temp_cvr = dataset['CVR'].str.replace(',', '.', regex=True)
print(temp_cvr.head())
0 3.16% 1 5.84% 2 5.44% 3 3.19% 4 5.90% Name: CVR, dtype: object
# Removing non-numeric characters such as symbols or spaces
temp_cvr = dataset['CVR'].str.replace('[^\d.]', '', regex=True)
print(temp_cvr.head())
0 3.16 1 5.84 2 5.44 3 3.19 4 5.90 Name: CVR, dtype: object
# Converting Object to Numeric
temp_cvr = pd.to_numeric(temp_cvr, errors='coerce')
print(temp_cvr.head())
0 3.16 1 5.84 2 5.44 3 3.19 4 5.90 Name: CVR, dtype: float64
dataset['CVR'] = temp_cvr
print(dataset['CVR'].head())
0 3.16 1 5.84 2 5.44 3 3.19 4 5.90 Name: CVR, dtype: float64
# Type
print(dataset.info())
<class 'pandas.core.frame.DataFrame'> RangeIndex: 500 entries, 0 to 499 Data columns (total 7 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 Campaign Name 500 non-null object 1 Clicks 499 non-null float64 2 Impressions 500 non-null int64 3 CPC 499 non-null float64 4 CTR 499 non-null float64 5 Conversions 500 non-null int64 6 CVR 499 non-null float64 dtypes: float64(4), int64(2), object(1) memory usage: 27.5+ KB None
srn.boxplot(dataset['Clicks'])
plt.show()
srn.histplot(dataset['Clicks'],bins=5,kde=True)
<Axes: xlabel='Clicks', ylabel='Count'>
srn.boxplot(dataset['Impressions'])
<Axes: ylabel='Impressions'>
srn.histplot(dataset['Impressions'],bins=5,kde=True)
<Axes: xlabel='Impressions', ylabel='Count'>
srn.boxplot(dataset['CPC'])
<Axes: ylabel='CPC'>
srn.histplot(dataset['CPC'],bins=5)
<Axes: xlabel='CPC', ylabel='Count'>
dataset['CPC'].describe()
CPC | |
---|---|
count | 499.000000 |
mean | 1.359960 |
std | 11.155968 |
min | 0.440000 |
25% | 0.640000 |
50% | 0.820000 |
75% | 1.100000 |
max | 250.000000 |
# Creating a temporary series to show Boxplot without Outliers
temp_cpc2 = dataset['CPC']
print(temp_cpc2.head())
0 1.50 1 1.48 2 1.45 3 1.45 4 1.44 Name: CPC, dtype: float64
Q1 = temp_cpc2.quantile(0.25)
Q3 = temp_cpc2.quantile(0.75)
IQR = Q3 - Q1
lower_bound = Q1 - 1.5 * IQR
upper_bound = Q3 + 1.5 * IQR
filtered_data = temp_cpc2[(temp_cpc2 >= lower_bound) & (temp_cpc2 <= upper_bound)]
# Boxplot without Outliers
srn.boxplot(filtered_data)
<Axes: ylabel='CPC'>
# Histogram without Outliers
bins = [0, 0.3, 0.6, 0.9, 1.2, 1.5]
srn.histplot(filtered_data,bins=bins,kde=True)
<Axes: xlabel='CPC', ylabel='Count'>
srn.boxplot(dataset['CTR'])
<Axes: ylabel='CTR'>
srn.histplot(dataset['CTR'],bins=5, kde=True)
<Axes: xlabel='CTR', ylabel='Count'>
srn.boxplot(dataset['Conversions'])
<Axes: ylabel='Conversions'>
srn.histplot(dataset['Conversions'],bins=5,kde=True)
<Axes: xlabel='Conversions', ylabel='Count'>
srn.boxplot(dataset['CVR'])
<Axes: ylabel='CVR'>
srn.histplot(dataset['CVR'],bins=5, kde=True)
<Axes: xlabel='CVR', ylabel='Count'>
# Check NAN
dataset.isnull().sum()
0 | |
---|---|
Campaign Name | 0 |
Clicks | 1 |
Impressions | 0 |
CPC | 1 |
CTR | 1 |
Conversions | 0 |
CVR | 1 |
# Replacing Null with the median as the distribution is asymmetric and has outliers
mediana_clicks = dataset['Clicks'].median()
dataset['Clicks'].fillna(mediana_clicks, inplace=True)
# Replacing Null with the median as the distribution is asymmetric and has outliers
mediana_cpc = dataset['CPC'].median()
dataset['CPC'].fillna(mediana_cpc, inplace=True)
# Replacing Null with the median as the distribution is asymmetric and has outliers
mediana_ctr = dataset['CTR'].median()
dataset['CTR'].fillna(mediana_ctr, inplace=True)
# Replacing Null with the median as the distribution is asymmetric and has outliers
mediana_cvr = dataset['CVR'].median()
dataset['CVR'].fillna(mediana_cvr, inplace=True)
# Check NAN
dataset.isnull().sum()
0 | |
---|---|
Campaign Name | 0 |
Clicks | 0 |
Impressions | 0 |
CPC | 0 |
CTR | 0 |
Conversions | 0 |
CVR | 0 |
dup = dataset.duplicated()
print(dup)
0 False 1 False 2 False 3 False 4 False ... 495 False 496 False 497 False 498 False 499 False Length: 500, dtype: bool
num_dup = dataset.duplicated().sum()
print(num_dup)
0
Q1 = dataset['Clicks'].quantile(0.25)
Q3 = dataset['Clicks'].quantile(0.75)
IQR = Q3 - Q1
limite_inferior = Q1 - 1.5 * IQR
limite_superior = Q3 + 1.5 * IQR
out_clicks = dataset[(dataset['Clicks'] < limite_inferior) | (dataset['Clicks'] > limite_superior)]
print(out_clicks)
Empty DataFrame Columns: [Campaign Name, Clicks, Impressions, CPC, CTR, Conversions, CVR] Index: []
Q1 = dataset['Impressions'].quantile(0.25)
Q3 = dataset['Impressions'].quantile(0.75)
IQR = Q3 - Q1
limite_inferior = Q1 - 1.5 * IQR
limite_superior = Q3 + 1.5 * IQR
out_imp = dataset[(dataset['Impressions'] < limite_inferior) | (dataset['Impressions'] > limite_superior)]
print(out_imp)
Campaign Name Clicks Impressions CPC CTR Conversions CVR 1 Campaign 57 908.0 17668 1.48 5.14 53 5.84 3 Campaign 464 909.0 17677 1.45 5.14 29 3.19 13 Campaign 410 974.0 18328 1.37 5.31 40 4.11 25 Campaign 483 974.0 16172 1.32 6.02 29 2.98 31 Campaign 478 928.0 15901 1.31 5.84 28 3.02 32 Campaign 479 895.0 16135 1.31 5.55 27 3.02 76 Campaign 285 997.0 19414 1.14 5.14 47 4.71 134 Campaign 389 984.0 18411 1.06 5.34 42 4.27 136 Campaign 52 988.0 19380 1.06 5.10 58 5.87 150 Campaign 197 980.0 16228 1.04 6.04 52 5.31 159 Campaign 370 988.0 16374 1.03 6.03 43 4.35 166 Campaign 148 957.0 18328 1.02 5.22 54 5.64 204 Campaign 476 920.0 15901 0.91 5.79 28 3.04 248 Campaign 339 868.0 15883 0.83 5.46 39 4.49 266 Campaign 154 978.0 16209 0.78 6.03 55 5.62 285 Campaign 327 995.0 18230 0.76 5.46 45 4.52 290 Campaign 139 914.0 15883 0.75 5.75 52 5.69 316 Campaign 443 929.0 15750 0.72 5.90 35 3.77 353 Campaign 293 962.0 15901 0.68 6.05 45 4.68 371 Campaign 51 955.0 15901 0.64 6.01 56 5.86 388 Campaign 269 857.0 15999 0.63 5.36 41 4.78 392 Campaign 290 896.0 16144 0.63 5.55 42 4.69 409 Campaign 461 955.0 15994 0.61 5.97 31 3.25 413 Campaign 401 936.0 15815 0.60 5.92 39 4.17 425 Campaign 454 955.0 15994 0.59 5.97 32 3.35 428 Campaign 455 869.0 15891 0.59 5.47 29 3.34 438 Campaign 398 997.0 16388 0.58 6.08 42 4.21 451 Campaign 229 881.0 16004 0.56 5.50 45 5.11 461 Campaign 222 974.0 16172 0.51 6.02 50 5.13 476 Campaign 221 991.0 16332 0.50 6.07 51 5.15 478 Campaign 245 854.0 15999 0.50 5.34 43 5.04
Q1 = dataset['CPC'].quantile(0.25)
Q3 = dataset['CPC'].quantile(0.75)
IQR = Q3 - Q1
limite_inferior = Q1 - 1.5 * IQR
limite_superior = Q3 + 1.5 * IQR
out_cpc = dataset[(dataset['CPC'] < limite_inferior) | (dataset['CPC'] > limite_superior)]
print(out_cpc)
Campaign Name Clicks Impressions CPC CTR Conversions CVR 338 Campaign 317 748.0 10241 250.0 7.3 34 4.55
# Removing Outlier
dataset_new = dataset[(dataset['CPC'] >= limite_inferior) & (dataset['CPC'] <= limite_superior)]
dataset_new.shape
(499, 7)
dataset=dataset_new
dataset.shape
dataset['CPC'].describe()
CPC | |
---|---|
count | 499.000000 |
mean | 0.860601 |
std | 0.256990 |
min | 0.440000 |
25% | 0.640000 |
50% | 0.820000 |
75% | 1.090000 |
max | 1.500000 |
dataset['CPC'].describe()
CPC | |
---|---|
count | 499.000000 |
mean | 0.860601 |
std | 0.256990 |
min | 0.440000 |
25% | 0.640000 |
50% | 0.820000 |
75% | 1.090000 |
max | 1.500000 |
Q1 = dataset['CTR'].quantile(0.25)
Q3 = dataset['CTR'].quantile(0.75)
IQR = Q3 - Q1
limite_inferior = Q1 - 1.5 * IQR
limite_superior = Q3 + 1.5 * IQR
out_ctr = dataset[(dataset['CTR'] < limite_inferior) | (dataset['CTR'] > limite_superior)]
print(out_ctr)
Empty DataFrame Columns: [Campaign Name, Clicks, Impressions, CPC, CTR, Conversions, CVR] Index: []
Q1 = dataset['Conversions'].quantile(0.25)
Q3 = dataset['Conversions'].quantile(0.75)
IQR = Q3 - Q1
limite_inferior = Q1 - 1.5 * IQR
limite_superior = Q3 + 1.5 * IQR
out_conv = dataset[(dataset['Conversions'] < limite_inferior) | (dataset['Conversions'] > limite_superior)]
print(out_conv)
Empty DataFrame Columns: [Campaign Name, Clicks, Impressions, CPC, CTR, Conversions, CVR] Index: []
Q1 = dataset['CVR'].quantile(0.25)
Q3 = dataset['CVR'].quantile(0.75)
IQR = Q3 - Q1
limite_inferior = Q1 - 1.5 * IQR
limite_superior = Q3 + 1.5 * IQR
out_cvr = dataset[(dataset['CVR'] < limite_inferior) | (dataset['CVR'] > limite_superior)]
print(out_cvr)
Empty DataFrame Columns: [Campaign Name, Clicks, Impressions, CPC, CTR, Conversions, CVR] Index: []
dataset[['Clicks', 'Impressions', 'CPC', 'CTR', 'Conversions', 'CVR']].describe()
Clicks | Impressions | CPC | CTR | Conversions | CVR | |
---|---|---|---|---|---|---|
count | 499.000000 | 499.000000 | 499.000000 | 499.000000 | 499.000000 | 499.000000 |
mean | 588.711423 | 7083.432866 | 0.860601 | 8.960982 | 28.212425 | 4.857174 |
std | 233.707926 | 3701.719453 | 0.256990 | 2.398614 | 11.958553 | 0.880457 |
min | 202.000000 | 2004.000000 | 0.440000 | 5.010000 | 8.000000 | 2.560000 |
25% | 376.500000 | 4254.500000 | 0.640000 | 7.310000 | 18.000000 | 4.325000 |
50% | 583.000000 | 6358.000000 | 0.820000 | 8.850000 | 28.000000 | 4.930000 |
75% | 780.000000 | 8827.000000 | 1.090000 | 10.565000 | 36.500000 | 5.645000 |
max | 1000.000000 | 19414.000000 | 1.500000 | 14.960000 | 58.000000 | 6.050000 |
dataset.isnull().any()
0 | |
---|---|
Campaign Name | False |
Clicks | False |
Impressions | False |
CPC | False |
CTR | False |
Conversions | False |
CVR | False |
srn.boxplot(dataset['Clicks'],color='green')
<Axes: ylabel='Clicks'>
srn.histplot(dataset['Clicks'],bins=5,color='green',kde=True)
plt.title('Clicks Distribution', fontsize=16)
Text(0.5, 1.0, 'Clicks Distribution')
srn.boxplot(dataset['Clicks'],color='orange')
<Axes: ylabel='Clicks'>
srn.histplot(dataset['Impressions'],bins=5,color='orange',kde=True)
plt.title('Impressions Distribution', fontsize=16)
Text(0.5, 1.0, 'Impressions Distribution')
srn.boxplot(dataset['CPC'],color='violet')
<Axes: ylabel='CPC'>
bins = [0, 0.3, 0.6, 0.9, 1.2, 1.5]
srn.histplot(dataset['CPC'],bins=bins,color='pink',kde=True)
plt.title('CPC Distribution', fontsize=16)
Text(0.5, 1.0, 'CPC Distribution')
srn.boxplot(dataset['CTR'])
<Axes: ylabel='CTR'>
srn.histplot(dataset['CTR'],bins=5,kde=True)
plt.title('CTR Distribution', fontsize=16)
Text(0.5, 1.0, 'CTR Distribution')
srn.boxplot(dataset['Conversions'],color='purple')
<Axes: ylabel='Conversions'>
srn.histplot(dataset['Conversions'],bins=5,color='purple',kde=True)
plt.title('Conversions Distribution', fontsize=16)
Text(0.5, 1.0, 'Conversions Distribution')
srn.boxplot(dataset['CVR'],color='brown')
<Axes: ylabel='CVR'>
srn.histplot(dataset['CVR'],bins=5,color='brown',kde=True)
plt.title('CVR Distribution', fontsize=16)
Text(0.5, 1.0, 'CVR Distribution')
# Gráfico de dispersão
srn.scatterplot(data=dataset, x='Clicks', y='Impressions')
plt.title('Clicks vs Impressions',fontsize=16)
plt.show()
srn.scatterplot(data=dataset, x='Clicks', y='Conversions', color='green')
plt.title('Clicks vs Conversions',fontsize=16)
plt.show()
srn.scatterplot(data=dataset, x='CPC', y='Conversions', color='red')
plt.title('CPC vs Conversions',fontsize=16)
plt.show()
srn.lineplot(data=dataset, x='CPC', y='Clicks', marker='o', color='blue')
<Axes: xlabel='CPC', ylabel='Clicks'>
srn.lineplot(data=dataset, x='CPC', y='Conversions', marker='o', color='violet')
<Axes: xlabel='CPC', ylabel='Conversions'>
srn.lineplot(data=dataset, x='Clicks', y='Impressions', marker='o', color='green')
<Axes: xlabel='Clicks', ylabel='Impressions'>
srn.lineplot(data=dataset, x='Clicks', y='Conversions', marker='o', color='brown')
<Axes: xlabel='Clicks', ylabel='Conversions'>
# Definindo intervalos de CPC
cpc_bins = [0, 0.3, 0.6, 0.9, 1.2, 1.5]
dataset['CPCbin'] = pd.cut(dataset['CPC'], bins=cpc_bins)
# Média de cliques por intervalo de CPC com observed=False
mean_clicks = dataset.groupby('CPCbin', observed=False)['Clicks'].mean().sort_values()
mean_clicks.plot(kind='barh', color='purple')
plt.title('Average of Clicks per CPC Interval',fontsize=16)
plt.xlabel('Average of Clicks')
plt.ylabel('Average of CPC')
plt.show()
# Definindo intervalos de CPC
cpc_bins = [0, 0.3, 0.6, 0.9, 1.2, 1.5]
dataset['CPCbin'] = pd.cut(dataset['CPC'], bins=cpc_bins)
# Média de cliques por intervalo de CPC com observed=False
mean_conversions = dataset.groupby('CPCbin', observed=False)['Conversions'].mean().sort_values()
mean_conversions.plot(kind='barh',color='green')
plt.title('Average of Conversions per CPC Interval',fontsize=16)
plt.xlabel('Average of Conversions')
plt.ylabel('Average of CPC')
plt.show()