Data Preparation

In this notebook, we clean, encode, and split the breast cancer dataset for later statistical modelling and analysis.

Import the packages

Import all the necessary packages for the following analysis.

import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder

Load the data

raw = pd.read_csv('../data/raw_data.csv')
raw.head()
id diagnosis radius_mean texture_mean perimeter_mean area_mean smoothness_mean compactness_mean concavity_mean concave points_mean ... texture_worst perimeter_worst area_worst smoothness_worst compactness_worst concavity_worst concave points_worst symmetry_worst fractal_dimension_worst Unnamed: 32
0 842302 M 17.99 10.38 122.80 1001.0 0.11840 0.27760 0.3001 0.14710 ... 17.33 184.60 2019.0 0.1622 0.6656 0.7119 0.2654 0.4601 0.11890 NaN
1 842517 M 20.57 17.77 132.90 1326.0 0.08474 0.07864 0.0869 0.07017 ... 23.41 158.80 1956.0 0.1238 0.1866 0.2416 0.1860 0.2750 0.08902 NaN
2 84300903 M 19.69 21.25 130.00 1203.0 0.10960 0.15990 0.1974 0.12790 ... 25.53 152.50 1709.0 0.1444 0.4245 0.4504 0.2430 0.3613 0.08758 NaN
3 84348301 M 11.42 20.38 77.58 386.1 0.14250 0.28390 0.2414 0.10520 ... 26.50 98.87 567.7 0.2098 0.8663 0.6869 0.2575 0.6638 0.17300 NaN
4 84358402 M 20.29 14.34 135.10 1297.0 0.10030 0.13280 0.1980 0.10430 ... 16.67 152.20 1575.0 0.1374 0.2050 0.4000 0.1625 0.2364 0.07678 NaN

5 rows × 33 columns
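
The trailing Unnamed: 32 column in the preview looks empty. A quick check (a one-liner, assuming the column name shown in the header above) confirms that it contains only missing values before we drop it:

# confirm the trailing column is entirely NaN; should return True
raw['Unnamed: 32'].isna().all()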

Clean the data

Drop the trailing Unnamed: 32 column, which is entirely empty (all NaN).

# drop the last column (the all-NaN 'Unnamed: 32')
clean = raw.iloc[:, :-1]
clean.head()
id diagnosis radius_mean texture_mean perimeter_mean area_mean smoothness_mean compactness_mean concavity_mean concave points_mean ... radius_worst texture_worst perimeter_worst area_worst smoothness_worst compactness_worst concavity_worst concave points_worst symmetry_worst fractal_dimension_worst
0 842302 M 17.99 10.38 122.80 1001.0 0.11840 0.27760 0.3001 0.14710 ... 25.38 17.33 184.60 2019.0 0.1622 0.6656 0.7119 0.2654 0.4601 0.11890
1 842517 M 20.57 17.77 132.90 1326.0 0.08474 0.07864 0.0869 0.07017 ... 24.99 23.41 158.80 1956.0 0.1238 0.1866 0.2416 0.1860 0.2750 0.08902
2 84300903 M 19.69 21.25 130.00 1203.0 0.10960 0.15990 0.1974 0.12790 ... 23.57 25.53 152.50 1709.0 0.1444 0.4245 0.4504 0.2430 0.3613 0.08758
3 84348301 M 11.42 20.38 77.58 386.1 0.14250 0.28390 0.2414 0.10520 ... 14.91 26.50 98.87 567.7 0.2098 0.8663 0.6869 0.2575 0.6638 0.17300
4 84358402 M 20.29 14.34 135.10 1297.0 0.10030 0.13280 0.1980 0.10430 ... 22.54 16.67 152.20 1575.0 0.1374 0.2050 0.4000 0.1625 0.2364 0.07678

5 rows × 32 columns
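
Positional slicing works here, but dropping the column by name is more robust if the column order ever changes. A minimal equivalent sketch:

# equivalent, name-based drop of the empty column
clean = raw.drop(columns=['Unnamed: 32'])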

Check for missing values

# count the missing values in each column
clean.isna().sum()
id                         0
diagnosis                  0
radius_mean                0
texture_mean               0
perimeter_mean             0
area_mean                  0
smoothness_mean            0
compactness_mean           0
concavity_mean             0
concave points_mean        0
symmetry_mean              0
fractal_dimension_mean     0
radius_se                  0
texture_se                 0
perimeter_se               0
area_se                    0
smoothness_se              0
compactness_se             0
concavity_se               0
concave points_se          0
symmetry_se                0
fractal_dimension_se       0
radius_worst               0
texture_worst              0
perimeter_worst            0
area_worst                 0
smoothness_worst           0
compactness_worst          0
concavity_worst            0
concave points_worst       0
symmetry_worst             0
fractal_dimension_worst    0
dtype: int64
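
No column contains any missing values, so there is nothing to fix. Had there been any, a minimal (hypothetical) fix would be to drop the affected rows, or impute them if dropping loses too much data:

# hypothetical: drop rows with any missing value
clean = clean.dropna()
# alternatively, impute numeric columns with their median:
# clean = clean.fillna(clean.median(numeric_only=True))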

Check for duplicate observations/rows

# count the duplicated observations/rows
clean.duplicated().sum()
0
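
No rows are duplicated either. If any were, they could be removed with:

# hypothetical: keep only the first occurrence of each duplicated row
clean = clean.drop_duplicates()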

Check the data type of each column

# print out the type of each column
clean.dtypes
id                           int64
diagnosis                   object
radius_mean                float64
texture_mean               float64
perimeter_mean             float64
area_mean                  float64
smoothness_mean            float64
compactness_mean           float64
concavity_mean             float64
concave points_mean        float64
symmetry_mean              float64
fractal_dimension_mean     float64
radius_se                  float64
texture_se                 float64
perimeter_se               float64
area_se                    float64
smoothness_se              float64
compactness_se             float64
concavity_se               float64
concave points_se          float64
symmetry_se                float64
fractal_dimension_se       float64
radius_worst               float64
texture_worst              float64
perimeter_worst            float64
area_worst                 float64
smoothness_worst           float64
compactness_worst          float64
concavity_worst            float64
concave points_worst       float64
symmetry_worst             float64
fractal_dimension_worst    float64
dtype: object
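
All feature columns are already float64 and id is int64, so no type needs fixing. If a numeric column had been parsed as object (for example, because of stray non-numeric strings), a hedged fix would be to coerce it back, sending unparseable values to NaN:

# hypothetical: coerce a mis-parsed numeric column back to float;
# unparseable entries become NaN and would be caught by the check above
clean['radius_mean'] = pd.to_numeric(clean['radius_mean'], errors='coerce')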

Since diagnosis is a categorical variable (the strings 'M' and 'B'), we need to one-hot encode it.

# fit a one-hot encoder and use it to transform the data
encoder = OneHotEncoder(drop='first')  # drop one category to avoid the dummy-variable trap (perfect collinearity)
# diagnosis is binary, so a single encoded column remains; ravel flattens it to 1-D for the assignment
diagnosis_enc = encoder.fit_transform(clean[['diagnosis']]).toarray().ravel()
enc_clean = clean.copy()
enc_clean['diagnosis'] = diagnosis_enc
enc_clean.head()
id diagnosis radius_mean texture_mean perimeter_mean area_mean smoothness_mean compactness_mean concavity_mean concave points_mean ... radius_worst texture_worst perimeter_worst area_worst smoothness_worst compactness_worst concavity_worst concave points_worst symmetry_worst fractal_dimension_worst
0 842302 1.0 17.99 10.38 122.80 1001.0 0.11840 0.27760 0.3001 0.14710 ... 25.38 17.33 184.60 2019.0 0.1622 0.6656 0.7119 0.2654 0.4601 0.11890
1 842517 1.0 20.57 17.77 132.90 1326.0 0.08474 0.07864 0.0869 0.07017 ... 24.99 23.41 158.80 1956.0 0.1238 0.1866 0.2416 0.1860 0.2750 0.08902
2 84300903 1.0 19.69 21.25 130.00 1203.0 0.10960 0.15990 0.1974 0.12790 ... 23.57 25.53 152.50 1709.0 0.1444 0.4245 0.4504 0.2430 0.3613 0.08758
3 84348301 1.0 11.42 20.38 77.58 386.1 0.14250 0.28390 0.2414 0.10520 ... 14.91 26.50 98.87 567.7 0.2098 0.8663 0.6869 0.2575 0.6638 0.17300
4 84358402 1.0 20.29 14.34 135.10 1297.0 0.10030 0.13280 0.1980 0.10430 ... 22.54 16.67 152.20 1575.0 0.1374 0.2050 0.4000 0.1625 0.2364 0.07678

5 rows × 32 columns

OneHotEncoder orders categories alphabetically, so 'B' is the dropped baseline: after the transformation, diagnosis = 1.0 stands for malignant and diagnosis = 0.0 stands for benign.
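
Because diagnosis is binary, the same encoding can also be obtained without scikit-learn by mapping the two labels directly; a sketch equivalent to the encoder above:

# equivalent manual encoding for the binary label
enc_clean['diagnosis'] = clean['diagnosis'].map({'M': 1.0, 'B': 0.0})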

Train-val-test Split

First split the dataset into training+validation and test sets (15% for testing), then split the former into training and validation sets (20% of the remainder for validation).

train_val, test = train_test_split(enc_clean, test_size=0.15, random_state=159)
train, val = train_test_split(train_val, test_size=0.2, random_state=159)
train.shape[0], val.shape[0], test.shape[0]
(386, 97, 86)
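
Note that both splits are purely random. If the later modelling is sensitive to the benign/malignant ratio in each subset, train_test_split accepts a stratify argument; a sketch of a stratified variant (not used above):

# hypothetical stratified variant of the split above
train_val, test = train_test_split(enc_clean, test_size=0.15, random_state=159,
                                   stratify=enc_clean['diagnosis'])
train, val = train_test_split(train_val, test_size=0.2, random_state=159,
                              stratify=train_val['diagnosis'])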

Save the datasets

enc_clean.to_csv('../data/clean.csv', index=False)
train.to_csv('../data/train.csv', index=False)
val.to_csv('../data/val.csv', index=False)
test.to_csv('../data/test.csv', index=False)

Combine the pipeline

We can combine the steps above into reusable helper functions.

def clean_data(data, enc_columns=[]):
    """
    Clean the data and one-hot encode the given columns/features
    ---
    Arguments:
    data (pandas.DataFrame): the data to clean
    enc_columns (list[str]): the names of the (binary) columns to be one-hot encoded
    
    Returns:
    clean_data (pandas.DataFrame): the cleaned data
    """
    
    # drop the last column (the all-NaN 'Unnamed: 32' in the raw data)
    clean_data = data.iloc[:, :-1]
    
    # drop the observations with missing values
    clean_data = clean_data.dropna()
    
    # drop the duplicated observations
    clean_data = clean_data.drop_duplicates()
    
    # one-hot encode the requested columns; copy once *before* the loop
    # so that encoding one column does not overwrite an earlier one
    enc_clean = clean_data.copy()
    for col in enc_columns:
        encoder = OneHotEncoder(drop='first')  # drop one category to avoid the dummy-variable trap
        # assumes a binary column, so a single encoded column remains after the drop
        col_enc = encoder.fit_transform(clean_data[[col]]).toarray().ravel()
        enc_clean[col] = col_enc
    
    clean_data = enc_clean.reset_index(drop=True)
    
    return clean_data

def load_data(file_path, enc_columns=[], val_size=0.2, test_size=0.15, random_state=159):
    """
    Read, clean, and split the breast cancer data.
    ---
    Arguments:
    file_path (str): the path to the data we want to load
    enc_columns (list[str]): the names of the columns to be one-hot encoded 
    val_size (float): the fraction of the train+validation data held out for validation
    test_size (float): the fraction of the whole dataset held out for testing
    random_state (int): the seed that makes both splits reproducible
    
    Returns:
    (train, val, test) (tuple(pandas.DataFrame * 3)): the training, validation, and testing sets
    """
    assert (val_size <= 1) and (val_size >= 0), "Invalid validation set size" 
    assert (test_size <= 1) and (test_size >= 0), "Invalid testing set size"
    
    raw = pd.read_csv(file_path)
    clean = clean_data(raw, enc_columns)
    
    # train-val-test split of the data
    train_val, test = train_test_split(clean, test_size=test_size, random_state=random_state)
    train, val = train_test_split(train_val, test_size=val_size, random_state=random_state)
    
    return train, val, test

The following cell contains some simple tests for the helper functions above.

def test_clean_data():
    test_input = pd.DataFrame(data={"a":[1.0, np.nan, 3.0, 1.0],
                                    "b":[1.0, 3.0, 3.0, 1.0],
                                    "c":[2.0, 4, 1, 4]})
    test_output = pd.DataFrame(data={"a":[1.0, 3.0],
                                     "b":[1.0, 3.0]})
    out = clean_data(test_input)
    assert test_output.equals(out)
    
def test_load_data():
    val_size = 0.2
    test_size = 0.2
    train, val, test = load_data('../data/raw_data.csv', val_size=val_size, test_size=test_size)
    # 569 observations in total; the sizes follow from the 0.8/0.2 splits
    assert train.shape == (round(569 * 0.8 * 0.8), 32)
    assert val.shape == (round(569 * 0.8 * 0.2), 32)
    assert test.shape == (round(569 * 0.2), 32)

test_clean_data()
test_load_data()

The following cell reproduces the final datasets from the steps before the Combine the pipeline section, this time in a single call to the load_data helper from the diagnosis.prepare module.

from diagnosis.prepare import load_data

train, val, test = load_data('../data/raw_data.csv', val_size=0.2, test_size=0.15, random_state=159)
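
As a quick sanity check, the resulting sizes should match the ones obtained step by step earlier:

# the helper should reproduce the earlier split sizes
assert (train.shape[0], val.shape[0], test.shape[0]) == (386, 97, 86)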