Data Cleaning
Data Cleaning
# Import packages
import pandas as pd
from tools import utils
# Read the raw CSV export into a DataFrame
data = pd.read_csv("data/Algerian_forest_fires_dataset_UPDATE.csv")

# Remove every whitespace character from the values of the title column,
# then move the current index into regular columns
title_column = "Bejaia Region Dataset "
stripped_values = data[title_column].str.replace(r'\s', '', regex=True)
data[title_column] = stripped_values
data = data.reset_index()

# Column names come from the first row (whitespace removed); header_index
# holds the index labels of the rows whose title-column value is "Classes"
first_row = data.iloc[0, :]
column_names = first_row.str.replace(r'\s', '', regex=True).values
is_classes_row = data[title_column] == "Classes"
header_index = data[is_classes_row].index

# Assemble the cleaned table with the project helper and report its shape
cleaned = utils.restructure_data(data, header_index, column_names)
print(f"Cleaned data have shape {cleaned.shape}")
cleaned.head()
Cleaned data have shape (243, 15)
day | month | year | Temperature | RH | Ws | Rain | FFMC | DMC | DC | ISI | BUI | FWI | Classes | Region | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
1 | 01 | 06 | 2012 | 29 | 57 | 18 | 0 | 65.7 | 3.4 | 7.6 | 1.3 | 3.4 | 0.5 | notfire | Bejaia |
2 | 02 | 06 | 2012 | 29 | 61 | 13 | 1.3 | 64.4 | 4.1 | 7.6 | 1 | 3.9 | 0.4 | notfire | Bejaia |
3 | 03 | 06 | 2012 | 26 | 82 | 22 | 13.1 | 47.1 | 2.5 | 7.1 | 0.3 | 2.7 | 0.1 | notfire | Bejaia |
4 | 04 | 06 | 2012 | 25 | 89 | 13 | 2.5 | 28.6 | 1.3 | 6.9 | 0 | 1.7 | 0 | notfire | Bejaia |
5 | 05 | 06 | 2012 | 27 | 77 | 16 | 0 | 64.8 | 3 | 14.2 | 1.2 | 3.9 | 0.5 | notfire | Bejaia |
# Write the cleaned table to disk as CSV
output_path = "data/Algerian_forest_fires_dataset_CLEANED.csv"
cleaned.to_csv(output_path)