From Single Trees to Forests: Enhancing Real Estate Predictions with Ensembles

# Import necessary libraries for preprocessing

import pandas as pd

from sklearn.pipeline import Pipeline

from sklearn.impute import SimpleImputer

from sklearn.preprocessing import OrdinalEncoder, OneHotEncoder, FunctionTransformer

from sklearn.compose import ColumnTransformer

# Load the dataset
Ames = pd.read_csv('Ames.csv')

# Convert the below numeric features to categorical features.
# Casting to 'object' makes select_dtypes() below pick them up as categorical
# columns instead of numeric ones.
Ames['MSSubClass'] = Ames['MSSubClass'].astype('object')
Ames['YrSold'] = Ames['YrSold'].astype('object')
Ames['MoSold'] = Ames['MoSold'].astype('object')

# Numeric features: every int64/float64 column except 'PID' and 'SalePrice'
numeric_features = Ames.select_dtypes(include=['int64', 'float64']).drop(columns=['PID', 'SalePrice']).columns

# Categorical features: every object column except 'Electrical', which is
# handled by its own pipeline further down
categorical_features = Ames.select_dtypes(include=['object']).columns.difference(['Electrical'])
electrical_feature = ['Electrical']

# Manually specify the categories for ordinal encoding according to the data dictionary.
# Each list is ordered from lowest to highest rank; OrdinalEncoder maps a value to its
# position in the list. 'None' is the placeholder written by fill_none() for missing values.
ordinal_order = {
    'Electrical': ['Mix', 'FuseP', 'FuseF', 'FuseA', 'SBrkr'],  # Electrical system
    'LotShape': ['IR3', 'IR2', 'IR1', 'Reg'],  # General shape of property
    'Utilities': ['ELO', 'NoSeWa', 'NoSewr', 'AllPub'],  # Type of utilities available
    'LandSlope': ['Sev', 'Mod', 'Gtl'],  # Slope of property
    'ExterQual': ['Po', 'Fa', 'TA', 'Gd', 'Ex'],  # Evaluates the quality of the material on the exterior
    'ExterCond': ['Po', 'Fa', 'TA', 'Gd', 'Ex'],  # Evaluates the present condition of the material on the exterior
    'BsmtQual': ['None', 'Po', 'Fa', 'TA', 'Gd', 'Ex'],  # Height of the basement
    'BsmtCond': ['None', 'Po', 'Fa', 'TA', 'Gd', 'Ex'],  # General condition of the basement
    'BsmtExposure': ['None', 'No', 'Mn', 'Av', 'Gd'],  # Walkout or garden level basement walls
    'BsmtFinType1': ['None', 'Unf', 'LwQ', 'Rec', 'BLQ', 'ALQ', 'GLQ'],  # Quality of basement finished area
    'BsmtFinType2': ['None', 'Unf', 'LwQ', 'Rec', 'BLQ', 'ALQ', 'GLQ'],  # Quality of second basement finished area
    'HeatingQC': ['Po', 'Fa', 'TA', 'Gd', 'Ex'],  # Heating quality and condition
    'KitchenQual': ['Po', 'Fa', 'TA', 'Gd', 'Ex'],  # Kitchen quality
    'Functional': ['Sal', 'Sev', 'Maj2', 'Maj1', 'Mod', 'Min2', 'Min1', 'Typ'],  # Home functionality
    'FireplaceQu': ['None', 'Po', 'Fa', 'TA', 'Gd', 'Ex'],  # Fireplace quality
    'GarageFinish': ['None', 'Unf', 'RFn', 'Fin'],  # Interior finish of the garage
    'GarageQual': ['None', 'Po', 'Fa', 'TA', 'Gd', 'Ex'],  # Garage quality
    'GarageCond': ['None', 'Po', 'Fa', 'TA', 'Gd', 'Ex'],  # Garage condition
    'PavedDrive': ['N', 'P', 'Y'],  # Paved driveway
    'PoolQC': ['None', 'Fa', 'TA', 'Gd', 'Ex'],  # Pool quality
    'Fence': ['None', 'MnWw', 'GdWo', 'MnPrv', 'GdPrv'],  # Fence quality
}

# Extract list of ALL ordinal features from dictionary (dict insertion order is preserved)
ordinal_features = list(ordinal_order.keys())

# List of ordinal features except Electrical, in the same order as the dictionary
ordinal_except_electrical = [feature for feature in ordinal_features if feature != 'Electrical']

# Helper function to fill 'None' for missing categorical data
def fill_none(X):
    """Return a copy of *X* with every missing value replaced by the string "None".

    X is any object exposing a pandas-style ``fillna`` method (a DataFrame or
    Series). The input is not modified; ``fillna`` returns a new object.
    The replacement is the literal string "None", not the Python ``None`` object.
    """
    return X.fillna("None")

# Pipeline for 'Electrical': Fill missing value with mode then apply ordinal encoding
electrical_transformer = Pipeline(steps=[
    ('impute_electrical', SimpleImputer(strategy='most_frequent')),
    ('ordinal_electrical', OrdinalEncoder(categories=[ordinal_order['Electrical']])),
])

# Pipeline for numeric features: Impute missing values using mean
numeric_transformer = Pipeline(steps=[
    ('impute_mean', SimpleImputer(strategy='mean')),
])

# Pipeline for ordinal features: Fill missing values with 'None' then apply ordinal encoding.
# The category lists are built in the order of ordinal_except_electrical, so this
# transformer must be given its columns in that same order.
ordinal_transformer = Pipeline(steps=[
    ('fill_none', FunctionTransformer(fill_none, validate=False)),
    ('ordinal', OrdinalEncoder(categories=[ordinal_order[feature] for feature in ordinal_except_electrical])),
])

# Pipeline for nominal categorical features: Fill missing values with 'None' then apply one-hot encoding.
# Nominal features are the categorical columns that have no entry in ordinal_order.
nominal_features = [feature for feature in categorical_features if feature not in ordinal_features]
categorical_transformer = Pipeline(steps=[
    ('fill_none', FunctionTransformer(fill_none, validate=False)),
    ('onehot', OneHotEncoder(handle_unknown='ignore')),
])

# Combined preprocessor for numeric, ordinal, nominal, and specific electrical data.
# The output columns follow the order of this list: electrical, numeric, ordinal, nominal.
preprocessor = ColumnTransformer(
    transformers=[
        ('electrical', electrical_transformer, ['Electrical']),
        ('num', numeric_transformer, numeric_features),
        ('ordinal', ordinal_transformer, ordinal_except_electrical),
        ('nominal', categorical_transformer, nominal_features),
    ])

# Apply the preprocessing pipeline to Ames
transformed_data = preprocessor.fit_transform(Ames)

# ColumnTransformer returns a sparse matrix or a dense ndarray depending on the
# density of the combined output; densify only when the result is sparse so this
# step does not fail with AttributeError on an ndarray.
if hasattr(transformed_data, 'toarray'):
    transformed_data = transformed_data.toarray()

# Generate column names for the one-hot encoded features
onehot_features = preprocessor.named_transformers_['nominal'].named_steps['onehot'].get_feature_names_out()

# Combine all feature names, in the same order as the transformers in the preprocessor
all_feature_names = ['Electrical'] + list(numeric_features) + list(ordinal_except_electrical) + list(onehot_features)

# Convert the transformed array to a DataFrame
transformed_df = pd.DataFrame(transformed_data, columns=all_feature_names)

Source link

From Single Trees to Forests: Enhancing Real Estate Predictions with Ensembles

Comments

Leave a Reply Cancel reply