Some handy functions for grouping continuous variables and imputing missing values in a dataframe


The following example shows how to bin an age variable into groups,
along with some simple missing value imputation procedures.

There is also an example that transforms a timestamp variable into day-of-week and hour information.

import pandas as pd

from sklearn.base import BaseEstimator, TransformerMixin





# utility functions



def age_input(age):
    """Map a single age value to a categorical age-band label.

    Parameters
    ----------
    age : scalar
        A number or numeric string accepted by ``int()``, or a null value
        as recognised by ``pd.isnull`` (e.g. ``None`` or ``NaN``).

    Returns
    -------
    str
        ``'missing'`` for null input, otherwise one of ``'16-20'``,
        ``'21-24'``, ``'25-34'``, ``'35-44'``, ``'45-54'``, ``'55-64'``
        or ``'65+'``.

    Raises
    ------
    ValueError, TypeError
        Propagated from ``int()`` when a non-null value cannot be converted.

    Notes
    -----
    There is no lower bound check: any age of 20 or below (including values
    under 16) is labelled ``'16-20'``.
    """
    if pd.isnull(age):
        return 'missing'

    # int() truncates toward zero, so e.g. 20.9 is banded as 20.
    age = int(age)

    # (inclusive upper bound, label) pairs in ascending order; the first
    # bound that is >= age decides the band.
    bands = (
        (20, '16-20'),
        (24, '21-24'),
        (34, '25-34'),
        (44, '35-44'),
        (54, '45-54'),
        (64, '55-64'),
    )
    for upper, label in bands:
        if age <= upper:
            return label
    return '65+'



# missing value handling / imputation in a dataframe

def missing_handle(df):
    """Fill missing values in every column of ``df`` and return it.

    The frame is modified in place (columns are reassigned and indicator
    columns may be added); pass a copy if the original must be preserved.

    Per-column rules, chosen by dtype:

    * ``object`` -- nulls are replaced with the string ``'missing'``.
    * ``bool``   -- a ``<col>_null`` column of 0/1 flags is added, then
      nulls are replaced with the column's most frequent value.
    * anything else -- nulls are replaced with the sentinel ``-999``.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to impute. Column labels must support ``+ '_null'`` for any
        bool column.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object that was passed in.
    """
    # Snapshot the labels up front: the bool branch adds new columns while
    # we iterate, and those indicator columns must not be revisited.
    for col in list(df.columns):
        column = df[col]

        if column.dtype == object:
            df[col] = column.fillna('missing')
        elif column.dtype == bool:
            df[col + '_null'] = column.isnull().astype(int)
            # The original referenced an undefined name `data` here, which
            # raised NameError; the intended frame is `df`.
            modes = column.mode()
            # mode() is empty for a column with no values; nothing to fill.
            if not modes.empty:
                df[col] = column.fillna(modes.iloc[0])
        else:
            df[col] = column.fillna(-999)

    return df











class dayandhour_Transformer(BaseEstimator, TransformerMixin):
    """Stateless transformer deriving time and age-band features.

    ``transform`` expects a DataFrame with a ``'sentat'`` column (anything
    ``pd.to_datetime`` can parse) and an ``'age'`` column. It replaces them
    with ``'dayofweek'``, ``'hour'`` and ``'age_group'`` and then imputes
    missing values in all remaining columns via ``missing_handle``.
    """

    def __init__(self):
        """Create the transformer; there are no parameters to configure."""
        print('initialized')

    def fit(self, X, y=None):
        """Do nothing and return ``self``; the transformer learns no state."""
        return self

    def transform(self, X_, y=None):
        """Return a transformed copy of ``X_``; the input is left untouched.

        Parameters
        ----------
        X_ : pandas.DataFrame
            Must contain ``'sentat'`` and ``'age'`` columns.
        y : ignored
            Accepted only for scikit-learn API compatibility.

        Returns
        -------
        pandas.DataFrame
            Copy of ``X_`` without ``'sentat'`` and ``'age'``, with
            ``'dayofweek'``, ``'hour'`` and ``'age_group'`` appended and
            missing values filled.
        """
        X = X_.copy()

        # Parse the timestamp column once and reuse it for both features.
        sent_at = pd.to_datetime(X['sentat'])
        X['dayofweek'] = sent_at.dt.dayofweek
        X['hour'] = sent_at.dt.hour
        X = X.drop('sentat', axis=1)

        # Bin the raw age into categorical bands, then drop the raw value.
        X['age_group'] = X['age'].apply(age_input)
        X = X.drop('age', axis=1)

        # Impute whatever nulls remain (including any produced above).
        X = missing_handle(X)

        return X


# Instantiate the transformer (the constructor prints 'initialized').
dayandhour_transformer = dayandhour_Transformer()

# Usage example.
# NOTE(review): `df` is not defined anywhere in this snippet; it is presumably
# an existing DataFrame supplied by the reader. transform() reads its
# 'sentat' and 'age' columns, so both must be present.
df_new = dayandhour_transformer.transform(df)



Author: robot learner
Reprint policy: Unless otherwise stated, all articles in this blog are licensed under CC BY 4.0. If you reproduce this article, please credit the source: robot learner.
  TOC