Data Preparation

In this notebook, we clean, encode, and split the breast cancer dataset for later statistical modelling and analysis.

Import the packages

Import all the necessary packages for the following analysis.

import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder

Load the data

raw = pd.read_csv('../data/raw_data.csv')
raw.head()
id diagnosis radius_mean texture_mean perimeter_mean area_mean smoothness_mean compactness_mean concavity_mean concave points_mean ... texture_worst perimeter_worst area_worst smoothness_worst compactness_worst concavity_worst concave points_worst symmetry_worst fractal_dimension_worst Unnamed: 32
0 842302 M 17.99 10.38 122.80 1001.0 0.11840 0.27760 0.3001 0.14710 ... 17.33 184.60 2019.0 0.1622 0.6656 0.7119 0.2654 0.4601 0.11890 NaN
1 842517 M 20.57 17.77 132.90 1326.0 0.08474 0.07864 0.0869 0.07017 ... 23.41 158.80 1956.0 0.1238 0.1866 0.2416 0.1860 0.2750 0.08902 NaN
2 84300903 M 19.69 21.25 130.00 1203.0 0.10960 0.15990 0.1974 0.12790 ... 25.53 152.50 1709.0 0.1444 0.4245 0.4504 0.2430 0.3613 0.08758 NaN
3 84348301 M 11.42 20.38 77.58 386.1 0.14250 0.28390 0.2414 0.10520 ... 26.50 98.87 567.7 0.2098 0.8663 0.6869 0.2575 0.6638 0.17300 NaN
4 84358402 M 20.29 14.34 135.10 1297.0 0.10030 0.13280 0.1980 0.10430 ... 16.67 152.20 1575.0 0.1374 0.2050 0.4000 0.1625 0.2364 0.07678 NaN

5 rows × 33 columns
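
The trailing Unnamed: 32 column in the preview looks empty. A quick check (a one-liner, assuming the column name shown in the header above) confirms that it contains only missing values before we drop it:

# confirm the trailing column is entirely NaN; should return True
raw['Unnamed: 32'].isna().all()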

Clean the data

Drop the trailing Unnamed: 32 column, which is entirely empty (all NaN).

# drop the last column (the all-NaN 'Unnamed: 32')
clean = raw.iloc[:, :-1]
clean.head()
id diagnosis radius_mean texture_mean perimeter_mean area_mean smoothness_mean compactness_mean concavity_mean concave points_mean ... radius_worst texture_worst perimeter_worst area_worst smoothness_worst compactness_worst concavity_worst concave points_worst symmetry_worst fractal_dimension_worst
0 842302 M 17.99 10.38 122.80 1001.0 0.11840 0.27760 0.3001 0.14710 ... 25.38 17.33 184.60 2019.0 0.1622 0.6656 0.7119 0.2654 0.4601 0.11890
1 842517 M 20.57 17.77 132.90 1326.0 0.08474 0.07864 0.0869 0.07017 ... 24.99 23.41 158.80 1956.0 0.1238 0.1866 0.2416 0.1860 0.2750 0.08902
2 84300903 M 19.69 21.25 130.00 1203.0 0.10960 0.15990 0.1974 0.12790 ... 23.57 25.53 152.50 1709.0 0.1444 0.4245 0.4504 0.2430 0.3613 0.08758
3 84348301 M 11.42 20.38 77.58 386.1 0.14250 0.28390 0.2414 0.10520 ... 14.91 26.50 98.87 567.7 0.2098 0.8663 0.6869 0.2575 0.6638 0.17300
4 84358402 M 20.29 14.34 135.10 1297.0 0.10030 0.13280 0.1980 0.10430 ... 22.54 16.67 152.20 1575.0 0.1374 0.2050 0.4000 0.1625 0.2364 0.07678

5 rows × 32 columns
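
Positional slicing works here, but dropping the column by name is more robust if the column order ever changes. A minimal equivalent sketch:

# equivalent, name-based drop of the empty column
clean = raw.drop(columns=['Unnamed: 32'])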

Check for missing values

# count the missing values in each column
clean.isna().sum()
id                         0
diagnosis                  0
radius_mean                0
texture_mean               0
perimeter_mean             0
area_mean                  0
smoothness_mean            0
compactness_mean           0
concavity_mean             0
concave points_mean        0
symmetry_mean              0
fractal_dimension_mean     0
radius_se                  0
texture_se                 0
perimeter_se               0
area_se                    0
smoothness_se              0
compactness_se             0
concavity_se               0
concave points_se          0
symmetry_se                0
fractal_dimension_se       0
radius_worst               0
texture_worst              0
perimeter_worst            0
area_worst                 0
smoothness_worst           0
compactness_worst          0
concavity_worst            0
concave points_worst       0
symmetry_worst             0
fractal_dimension_worst    0
dtype: int64
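
No column contains any missing values, so there is nothing to fix. Had there been any, a minimal (hypothetical) fix would be to drop the affected rows, or impute them if dropping loses too much data:

# hypothetical: drop rows with any missing value
clean = clean.dropna()
# alternatively, impute numeric columns with their median:
# clean = clean.fillna(clean.median(numeric_only=True))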

Check for duplicate observations/rows

# count the duplicated observations/rows
clean.duplicated().sum()
0
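
No rows are duplicated either. If any were, they could be removed with:

# hypothetical: keep only the first occurrence of each duplicated row
clean = clean.drop_duplicates()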

Check the data type of each column

# print out the type of each column
clean.dtypes
id                           int64
diagnosis                   object
radius_mean                float64
texture_mean               float64
perimeter_mean             float64
area_mean                  float64
smoothness_mean            float64
compactness_mean           float64
concavity_mean             float64
concave points_mean        float64
symmetry_mean              float64
fractal_dimension_mean     float64
radius_se                  float64
texture_se                 float64
perimeter_se               float64
area_se                    float64
smoothness_se              float64
compactness_se             float64
concavity_se               float64
concave points_se          float64
symmetry_se                float64
fractal_dimension_se       float64
radius_worst               float64
texture_worst              float64
perimeter_worst            float64
area_worst                 float64
smoothness_worst           float64
compactness_worst          float64
concavity_worst            float64
concave points_worst       float64
symmetry_worst             float64
fractal_dimension_worst    float64
dtype: object
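
All feature columns are already float64 and id is int64, so no type needs fixing. If a numeric column had been parsed as object (for example, because of stray non-numeric strings), a hedged fix would be to coerce it back, sending unparseable values to NaN:

# hypothetical: coerce a mis-parsed numeric column back to float;
# unparseable entries become NaN and would be caught by the check above
clean['radius_mean'] = pd.to_numeric(clean['radius_mean'], errors='coerce')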

Since diagnosis is a categorical variable (the strings 'M' and 'B'), we need to one-hot encode it.

# fit a one-hot encoder and use it to transform the data
encoder = OneHotEncoder(drop='first')  # drop one category to avoid the dummy-variable trap (perfect collinearity)
# diagnosis is binary, so a single encoded column remains; ravel flattens it to 1-D for the assignment
diagnosis_enc = encoder.fit_transform(clean[['diagnosis']]).toarray().ravel()
enc_clean = clean.copy()
enc_clean['diagnosis'] = diagnosis_enc
enc_clean.head()
id diagnosis radius_mean texture_mean perimeter_mean area_mean smoothness_mean compactness_mean concavity_mean concave points_mean ... radius_worst texture_worst perimeter_worst area_worst smoothness_worst compactness_worst concavity_worst concave points_worst symmetry_worst fractal_dimension_worst
0 842302 1.0 17.99 10.38 122.80 1001.0 0.11840 0.27760 0.3001 0.14710 ... 25.38 17.33 184.60 2019.0 0.1622 0.6656 0.7119 0.2654 0.4601 0.11890
1 842517 1.0 20.57 17.77 132.90 1326.0 0.08474 0.07864 0.0869 0.07017 ... 24.99 23.41 158.80 1956.0 0.1238 0.1866 0.2416 0.1860 0.2750 0.08902
2 84300903 1.0 19.69 21.25 130.00 1203.0 0.10960 0.15990 0.1974 0.12790 ... 23.57 25.53 152.50 1709.0 0.1444 0.4245 0.4504 0.2430 0.3613 0.08758
3 84348301 1.0 11.42 20.38 77.58 386.1 0.14250 0.28390 0.2414 0.10520 ... 14.91 26.50 98.87 567.7 0.2098 0.8663 0.6869 0.2575 0.6638 0.17300
4 84358402 1.0 20.29 14.34 135.10 1297.0 0.10030 0.13280 0.1980 0.10430 ... 22.54 16.67 152.20 1575.0 0.1374 0.2050 0.4000 0.1625 0.2364 0.07678

5 rows × 32 columns

OneHotEncoder orders categories alphabetically, so 'B' is the dropped baseline: after the transformation, diagnosis = 1.0 stands for malignant and diagnosis = 0.0 stands for benign.
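
Because diagnosis is binary, the same encoding can also be obtained without scikit-learn by mapping the two labels directly; a sketch equivalent to the encoder above:

# equivalent manual encoding for the binary label
enc_clean['diagnosis'] = clean['diagnosis'].map({'M': 1.0, 'B': 0.0})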

Train-val-test Split

First split the dataset into training+validation and test sets (15% for testing), then split the former into training and validation sets (20% of the remainder for validation).

train_val, test = train_test_split(enc_clean, test_size=0.15, random_state=159)
train, val = train_test_split(train_val, test_size=0.2, random_state=159)
train.shape[0], val.shape[0], test.shape[0]
(386, 97, 86)
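
Note that both splits are purely random. If the later modelling is sensitive to the benign/malignant ratio in each subset, train_test_split accepts a stratify argument; a sketch of a stratified variant (not used above):

# hypothetical stratified variant of the split above
train_val, test = train_test_split(enc_clean, test_size=0.15, random_state=159,
                                   stratify=enc_clean['diagnosis'])
train, val = train_test_split(train_val, test_size=0.2, random_state=159,
                              stratify=train_val['diagnosis'])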

Save the datasets

enc_clean.to_csv('../data/clean.csv', index=False)
train.to_csv('../data/train.csv', index=False)
val.to_csv('../data/val.csv', index=False)
test.to_csv('../data/test.csv', index=False)

Combine the pipeline

We can combine the steps above into reusable helper functions.

def clean_data(data, enc_columns=[]):
    """
    Clean the data and one-hot encode the given columns/features
    ---
    Arguments:
    data (pandas.DataFrame): the data to clean
    enc_columns (list[str]): the names of the (binary) columns to be one-hot encoded
    
    Returns:
    clean_data (pandas.DataFrame): the cleaned data
    """
    
    # drop the last column (the all-NaN 'Unnamed: 32' in the raw data)
    clean_data = data.iloc[:, :-1]
    
    # drop the observations with missing values
    clean_data = clean_data.dropna()
    
    # drop the duplicated observations
    clean_data = clean_data.drop_duplicates()
    
    # one-hot encode the requested columns; copy once *before* the loop
    # so that encoding one column does not overwrite an earlier one
    enc_clean = clean_data.copy()
    for col in enc_columns:
        encoder = OneHotEncoder(drop='first')  # drop one category to avoid the dummy-variable trap
        # assumes a binary column, so a single encoded column remains after the drop
        col_enc = encoder.fit_transform(clean_data[[col]]).toarray().ravel()
        enc_clean[col] = col_enc
    
    clean_data = enc_clean.reset_index(drop=True)
    
    return clean_data

def load_data(file_path, enc_columns=[], val_size=0.2, test_size=0.15, random_state=159):
    """
    Read, clean, and split the breast cancer data.
    ---
    Arguments:
    file_path (str): the path to the data we want to load
    enc_columns (list[str]): the names of the columns to be one-hot encoded 
    val_size (float): the fraction of the train+validation data held out for validation
    test_size (float): the fraction of the whole dataset held out for testing
    random_state (int): the seed that makes both splits reproducible
    
    Returns:
    (train, val, test) (tuple(pandas.DataFrame * 3)): the training, validation, and testing sets
    """
    assert (val_size <= 1) and (val_size >= 0), "Invalid validation set size" 
    assert (test_size <= 1) and (test_size >= 0), "Invalid testing set size"
    
    raw = pd.read_csv(file_path)
    clean = clean_data(raw, enc_columns)
    
    # train-val-test split of the data
    train_val, test = train_test_split(clean, test_size=test_size, random_state=random_state)
    train, val = train_test_split(train_val, test_size=val_size, random_state=random_state)
    
    return train, val, test

The following cell contains some simple tests for the helper functions above.

def test_clean_data():
    test_input = pd.DataFrame(data={"a":[1.0, np.nan, 3.0, 1.0],
                                    "b":[1.0, 3.0, 3.0, 1.0],
                                    "c":[2.0, 4, 1, 4]})
    test_output = pd.DataFrame(data={"a":[1.0, 3.0],
                                     "b":[1.0, 3.0]})
    out = clean_data(test_input)
    assert test_output.equals(out)
    
def test_load_data():
    val_size = 0.2
    test_size = 0.2
    train, val, test = load_data('../data/raw_data.csv', val_size=val_size, test_size=test_size)
    # 569 observations in total; the sizes follow from the 0.8/0.2 splits
    assert train.shape == (round(569 * 0.8 * 0.8), 32)
    assert val.shape == (round(569 * 0.8 * 0.2), 32)
    assert test.shape == (round(569 * 0.2), 32)

test_clean_data()
test_load_data()

The following cell reproduces the final datasets from the steps before the Combine the pipeline section, this time in a single call to the load_data helper from the diagnosis.prepare module.

from diagnosis.prepare import load_data

train, val, test = load_data('../data/raw_data.csv', val_size=0.2, test_size=0.15, random_state=159)
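
As a quick sanity check, the resulting sizes should match the ones obtained step by step earlier:

# the helper should reproduce the earlier split sizes
assert (train.shape[0], val.shape[0], test.shape[0]) == (386, 97, 86)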