大家好,欢迎来到IT知识分享网。
WOE(Weight of Evidence,证据权重)特征转换
class CattoWoe(BaseEstimator, TransformerMixin):
    """Encode categorical columns as Weight-of-Evidence (WOE) values.

    Parameters
    ----------
    label : the label column name
    self_woedict : optional dict ``{column: {category: woe}}``; when given,
        ``transform`` uses it instead of the mapping learned by ``fit``
        (e.g. a mapping produced by an earlier fit and saved to a file).

    Attributes
    ----------
    woe_dict : dict of per-column mappings, example {'col1': {'xx': 0.235}}
    df : the frame that was passed to ``fit``

    Examples
    --------
    please refer to the readme example
    """

    def __init__(self, label, self_woedict=None):
        self.label = label
        self.self_woedict = self_woedict

    def fit(self, df, y=None):
        """Learn the WOE mapping of every non-label column of ``df``.

        Parameters
        ----------
        df : dataframe that contains the label column
        y : ignored; accepted so the estimator can be used where
            ``fit(X, y)`` is called (pipelines, ``fit_transform(X, y)``)

        Returns
        -------
        self
        """
        self.df = df
        # woe_transform adds a helper 'num' column to the frame it receives;
        # give it a copy so the caller's frame is left untouched.
        self.woe_dict = woe_transform(df.copy(), self.label)
        return self

    def transform(self, X=None):
        """Replace every category by its WOE value.

        Parameters
        ----------
        X : dataframe to encode; when omitted, the frame given to ``fit``
            is encoded. The label column and a column named 'num' are
            never encoded.

        Returns
        -------
        df : a new dataframe holding the WOE values. The input frame is not
            modified, so ``transform`` can safely be called repeatedly.

        Notes
        -----
        Categories missing from the mapping become NaN; a column missing
        from the mapping raises ``KeyError``.
        """
        source = X if X is not None else self.df
        if self.self_woedict is not None:
            woe_dict = self.self_woedict
        else:
            woe_dict = self.woe_dict

        # Work on a copy: encoding in place would corrupt the fitted frame
        # and make a second transform() map already-encoded values to NaN.
        encoded = source.copy()
        feature_cols = [col for col in encoded.columns
                        if col not in (self.label, 'num')]
        for col in feature_cols:
            encoded[col] = encoded[col].map(woe_dict[col])

        if X is None:
            # The helper column is only present if the fitted frame carried it.
            encoded = encoded.drop(columns=['num'], errors='ignore')
        return encoded
def woe_transform(df, label,
                  excel_path=r"C:\Users\Administrator\Desktop\project\评分卡\woe_iv4.xlsx"):
    """Compute the Weight-of-Evidence (WOE) of every category of every feature.

    Parameters
    ----------
    df : dataframe containing the label column and the feature columns.
        A helper column 'num' (a copy of the index) is added to it in place.
    label : name of the label column. Only the first two distinct label
        values are used (binary problems).
    excel_path : where the per-feature WOE/IV detail tables are written
        (sheet 'woe明细', tables stacked vertically). Pass ``None`` to skip
        the Excel export. Nothing is written when no table was produced.

    Returns
    -------
    dict : ``{column: {category: woe}}`` for every column other than
        ``label`` and 'num'.
    """
    # Only binary problems are handled for now; for multi-class one could
    # multiply the WOE by the class proportion, which amounts to adding a
    # prior probability.
    classes = df[label].unique()
    label_one = classes[0]
    label_two = classes[1]
    # Counted by the pivot table below. Kept on the caller's frame on purpose:
    # CattoWoe.transform expects to find (and drop) this column.
    df['num'] = df.index

    def woe_(attr):
        """Return (category -> WOE mapping, detail table or None) for one column."""
        pt = pd.pivot_table(df, index=label, columns=attr,
                            values='num', aggfunc='count').T
        if pt.empty:
            # No usable (label, category) pair: every category gets a neutral 0.
            return dict.fromkeys(df[attr].unique(), 0), None
        share_one = pt[label_one] / pt[label_one].sum()
        share_two = pt[label_two] / pt[label_two].sum()
        pt['WOEi'] = np.log(share_one / share_two).round(4)
        pt['IVi'] = pt.WOEi.mul(share_one - share_two).round(3)
        # Categories absent from one class have NaN counts, hence NaN WOE/IV.
        pt = pt.fillna(0)
        return dict(zip(pt.index.tolist(), pt.WOEi.tolist())), pt

    feature_cols = [col for col in df.columns if col not in (label, 'num')]
    woe_by_col = {}
    detail_tables = []
    for col in feature_cols:
        mapping, table = woe_(col)
        woe_by_col[col] = mapping
        if table is not None:
            detail_tables.append(table)

    if excel_path is not None and detail_tables:
        # The context manager saves and closes the workbook even on error.
        with pd.ExcelWriter(excel_path) as writer:
            offset = 0
            for table in detail_tables:
                table.to_excel(writer, sheet_name='woe明细', startrow=offset)
                offset += table.shape[0] + 2
    return woe_by_col
# WOE encoding demo: fit the encoder on `dff`, using the 'classification'
# column as the label.
# NOTE(review): `dff` is not defined in the visible code — confirm it is created upstream.
Cw=CattoWoe('classification')
wclf=Cw.fit(dff)
# Called without an argument, transform() encodes the frame that was given to fit().
wdf=wclf.transform()
# Bare expressions below: presumably notebook cell outputs for inspection.
wdf
# NOTE(review): `df` is not defined in the visible code — confirm it exists upstream.
df
dff
免责声明:本站所有文章内容,图片,视频等均是来源于用户投稿和互联网及文摘转载整编而成,不代表本站观点,不承担相关法律责任。其著作权各归其原作者或其出版社所有。如发现本站有涉嫌抄袭侵权/违法违规的内容,侵犯到您的权益,请在线联系站长,一经查实,本站将立刻删除。 本文来自网络,若有侵权,请联系删除,如若转载,请注明出处:https://yundeesoft.com/23015.html