# From Single Trees to Forests: Enhancing Real Estate Predictions with Ensembles


# Import necessary libraries for preprocessing

import pandas as pd

from sklearn.pipeline import Pipeline

from sklearn.impute import SimpleImputer

from sklearn.preprocessing import OrdinalEncoder, OneHotEncoder, FunctionTransformer

from sklearn.compose import ColumnTransformer

 

# Load the dataset from 'Ames.csv' (path is relative to the working directory)
Ames = pd.read_csv('Ames.csv')

# Convert the below numeric features to categorical features.
# Casting to object makes the dtype-based selection further down treat them
# as categorical columns instead of numeric ones.
Ames['MSSubClass'] = Ames['MSSubClass'].astype('object')
Ames['YrSold'] = Ames['YrSold'].astype('object')
Ames['MoSold'] = Ames['MoSold'].astype('object')

# Numeric features: every int64/float64 column, with 'PID' and 'SalePrice'
# excluded from the feature set.
numeric_features = Ames.select_dtypes(include=['int64', 'float64']).drop(columns=['PID', 'SalePrice']).columns

# Categorical features: every object column except 'Electrical', which is
# kept in its own list so it can be handled separately.
categorical_features = Ames.select_dtypes(include=['object']).columns.difference(['Electrical'])
electrical_feature = ['Electrical']

 

# Manually specify the categories for ordinal encoding according to the data dictionary.
# Each list runs from the lowest-ranked level to the highest-ranked level; the
# position of a level in its list is the integer an OrdinalEncoder assigns to it.
ordinal_order = {
    'Electrical': ['Mix', 'FuseP', 'FuseF', 'FuseA', 'SBrkr'],  # Electrical system
    'LotShape': ['IR3', 'IR2', 'IR1', 'Reg'],  # General shape of property
    'Utilities': ['ELO', 'NoSeWa', 'NoSewr', 'AllPub'],  # Type of utilities available
    'LandSlope': ['Sev', 'Mod', 'Gtl'],  # Slope of property
    'ExterQual': ['Po', 'Fa', 'TA', 'Gd', 'Ex'],  # Evaluates the quality of the material on the exterior
    'ExterCond': ['Po', 'Fa', 'TA', 'Gd', 'Ex'],  # Evaluates the present condition of the material on the exterior
    'BsmtQual': ['None', 'Po', 'Fa', 'TA', 'Gd', 'Ex'],  # Height of the basement
    'BsmtCond': ['None', 'Po', 'Fa', 'TA', 'Gd', 'Ex'],  # General condition of the basement
    'BsmtExposure': ['None', 'No', 'Mn', 'Av', 'Gd'],  # Walkout or garden level basement walls
    'BsmtFinType1': ['None', 'Unf', 'LwQ', 'Rec', 'BLQ', 'ALQ', 'GLQ'],  # Quality of basement finished area
    'BsmtFinType2': ['None', 'Unf', 'LwQ', 'Rec', 'BLQ', 'ALQ', 'GLQ'],  # Quality of second basement finished area
    'HeatingQC': ['Po', 'Fa', 'TA', 'Gd', 'Ex'],  # Heating quality and condition
    'KitchenQual': ['Po', 'Fa', 'TA', 'Gd', 'Ex'],  # Kitchen quality
    'Functional': ['Sal', 'Sev', 'Maj2', 'Maj1', 'Mod', 'Min2', 'Min1', 'Typ'],  # Home functionality
    'FireplaceQu': ['None', 'Po', 'Fa', 'TA', 'Gd', 'Ex'],  # Fireplace quality
    'GarageFinish': ['None', 'Unf', 'RFn', 'Fin'],  # Interior finish of the garage
    'GarageQual': ['None', 'Po', 'Fa', 'TA', 'Gd', 'Ex'],  # Garage quality
    'GarageCond': ['None', 'Po', 'Fa', 'TA', 'Gd', 'Ex'],  # Garage condition
    'PavedDrive': ['N', 'P', 'Y'],  # Paved driveway
    'PoolQC': ['None', 'Fa', 'TA', 'Gd', 'Ex'],  # Pool quality
    'Fence': ['None', 'MnWw', 'GdWo', 'MnPrv', 'GdPrv'],  # Fence quality
}

# Extract list of ALL ordinal features from dictionary (dict insertion order)
ordinal_features = list(ordinal_order.keys())

# List of ordinal features except Electrical, in the same relative order
ordinal_except_electrical = [feature for feature in ordinal_features if feature != 'Electrical']

 

# Helper function to fill 'None' for missing categorical data
def fill_none(X):
    """Return ``X`` with every missing value replaced by the string ``"None"``.

    Args:
        X: An object exposing a pandas-style ``fillna`` method (for example a
            ``DataFrame`` or ``Series``).

    Returns:
        Whatever ``X.fillna("None")`` returns; ``X`` itself is not modified.
    """
    return X.fillna("None")

 

# Pipeline for 'Electrical': Fill missing value with mode then apply ordinal encoding
electrical_transformer = Pipeline(steps=[
    ('impute_electrical', SimpleImputer(strategy='most_frequent')),
    ('ordinal_electrical', OrdinalEncoder(categories=[ordinal_order['Electrical']])),
])

# Pipeline for numeric features: Impute missing values using mean
numeric_transformer = Pipeline(steps=[
    ('impute_mean', SimpleImputer(strategy='mean')),
])

# Pipeline for ordinal features: Fill missing values with 'None' then apply ordinal encoding.
# The categories list is built from ordinal_except_electrical so that its order
# matches the column order this pipeline is given in the ColumnTransformer.
ordinal_transformer = Pipeline(steps=[
    ('fill_none', FunctionTransformer(fill_none, validate=False)),
    ('ordinal', OrdinalEncoder(categories=[ordinal_order[feature] for feature in ordinal_except_electrical])),
])

# Pipeline for nominal categorical features: Fill missing values with 'None' then apply one-hot encoding.
# Nominal features are the categorical columns that have no entry in the ordinal list.
nominal_features = [feature for feature in categorical_features if feature not in ordinal_features]
categorical_transformer = Pipeline(steps=[
    ('fill_none', FunctionTransformer(fill_none, validate=False)),
    ('onehot', OneHotEncoder(handle_unknown='ignore')),
])

 

# Combined preprocessor for numeric, ordinal, nominal, and specific electrical data.
# The transformer order here fixes the column order of the transformed output:
# electrical, then numeric, then ordinal, then the one-hot encoded nominal columns.
preprocessor = ColumnTransformer(
    transformers=[
        ('electrical', electrical_transformer, electrical_feature),
        ('num', numeric_transformer, numeric_features),
        ('ordinal', ordinal_transformer, ordinal_except_electrical),
        ('nominal', categorical_transformer, nominal_features),
    ]
)

 

# Apply the preprocessing pipeline to Ames
transformed_data = preprocessor.fit_transform(Ames)

# ColumnTransformer returns a sparse matrix only when the stacked output is
# sparse enough (see its sparse_threshold parameter); otherwise it already
# returns a dense array, which has no .toarray(). Densify only when needed.
if hasattr(transformed_data, 'toarray'):
    transformed_data = transformed_data.toarray()

# Generate column names for the one-hot encoded features
onehot_features = preprocessor.named_transformers_['nominal'].named_steps['onehot'].get_feature_names_out()

# Combine all feature names, in the same order as the ColumnTransformer's
# transformers: electrical, numeric, ordinal, one-hot nominal.
all_feature_names = ['Electrical'] + list(numeric_features) + list(ordinal_except_electrical) + list(onehot_features)

# Convert the transformed array to a DataFrame
transformed_df = pd.DataFrame(transformed_data, columns=all_feature_names)



Source link

Comments

No comments yet. Why don’t you start the discussion?

Leave a Reply

Your email address will not be published. Required fields are marked *