# -*- coding:utf-8 -*-
# @author Yue Bin ( hi.moonlight@gmail.com )
# @date 2017-06-07
from __future__ import print_function
import os,sys
from mmh3 import hash as mmh3_hash
from mooncake_utils.log import *
from cityhash import CityHash32 as city_hash
logger = get_logger(name="fea")
class FeatureHasher(object):
    """
    A simple feature-extraction helper based on the hashing trick.

    Constructor parameters:

    :param size: total feature dimension, i.e. the number of hash buckets.
    :param hash_module: hashing library to use, one of ``city``, ``mmh3``.
    :param debug: emit debug log messages.
    :param print_collision: whether to track and report the collision rate.
    :param dense: whether to produce the dense result format.
    :param use_col_index: for numeric values, use the column index as the
        feature key instead of the hash result.
    :raises ValueError: if ``hash_module`` is not one of the known names.
    """

    def __init__(self,
                 size=1000, hash_module="city",
                 debug=False, print_collision=False,
                 dense=False, use_col_index=False):
        self.size = size
        self.debug = debug
        # bucket -> {hashed object: 1}; only filled when print_collision is on.
        self._collision = {}
        self.print_collision = print_collision
        self.dense = dense
        self.use_col_index = use_col_index
        self.__init_variable()
        if hash_module == "mmh3":
            self._hashlib = mmh3_hash
        elif hash_module == "city":
            self._hashlib = city_hash
        else:
            # ValueError is an Exception subclass, so existing handlers
            # that catch Exception keep working.
            raise ValueError("unknown hash function")

    def __init_variable(self):
        """Reset the per-record state (the running list-item counter)."""
        self.__counter = 0

    def __hash(self, obj):
        """Hash ``obj`` with the configured function, reduced modulo ``size``.

        When ``print_collision`` is enabled, the object is also recorded
        under its bucket so that ``collision()`` can report shared buckets.
        """
        ret = abs(self._hashlib(obj)) % self.size
        if self.print_collision:
            self._collision.setdefault(ret, {})[obj] = 1
        return ret

    def collision(self):
        """Log how many used buckets hold more than one distinct object.

        Does nothing when ``print_collision`` is disabled or when no value
        has been hashed yet.
        """
        if not self.print_collision:
            return
        total = len(self._collision)
        if not total:
            # Nothing recorded yet: the rate would be a division by zero.
            return
        # Kept as a float so the logged text matches the historical format.
        cnt = float(sum(1 for objs in self._collision.values()
                        if len(objs) > 1))
        logger.info("collision[%s] total[%s] rate[%.4f%%]" % (cnt,
                    total, 100 * cnt / total))

    def string_hash(self, key, value):
        """Hash a string feature.

        The bucket is computed from the concatenation ``key + value``.

        :returns: ``(bucket, bucket)`` in dense mode, else ``(bucket, 1)``.
        """
        h_key = key + value
        hash_value = self.__hash(h_key)
        if self.debug:
            logger.debug("->key[%s] value[%s] / h_key[%s]->[%s] h_value[%s]->[%s]" % (
                key, value, h_key, hash_value, value, 1))
        if self.dense:
            return hash_value, hash_value
        return hash_value, 1

    def number_hash(self, key, value):
        """Hash a numeric feature.

        In dense mode the key is used as-is; otherwise ``str(key)`` is
        hashed into a bucket. The value is passed through unchanged.

        :returns: ``(feature_key, value)``.
        """
        if self.dense:
            hash_key = key
        else:
            hash_key = self.__hash(str(key))
        if self.debug:
            logger.debug("->key[%s] value[%s] / h_key[%s]->[%s] h_value[%s]->[%s]" % (
                key, value, key, hash_key, value, value))
        return hash_key, value

    def check_valid(self, obj):
        """Return ``False`` if the stripped string is ``""``, ``"-"`` or ``"0"``."""
        return obj.strip() not in ["", "-", "0"]

    def single_hash(self, key, value, ret, index):
        """Hash one scalar feature and store the result in ``ret``.

        :param key: feature name.
        :param value: a string or a number (``int`` / ``float``).
        :param ret: output dict, updated in place as ``ret[h_key] = h_val``.
        :param index: key used for numeric values when ``use_col_index``
            is enabled.
        :raises TypeError: if ``value`` is neither a string nor a number.
        """
        if isinstance(value, str):
            if not self.check_valid(value):
                if self.debug:
                    logger.debug("invalid value[%s]" % value)
                return
            h_key, h_val = self.string_hash(key, value)
        elif isinstance(value, (int, float)) and not isinstance(value, bool):
            # isinstance accepts int/float subclasses; bool is excluded
            # explicitly because it was never treated as a number here.
            if self.use_col_index:
                h_key, h_val = index, value
            else:
                h_key, h_val = self.number_hash(key, value)
        else:
            raise TypeError("unknown")
        ret[h_key] = h_val

    def list_hash(self, key, obj, ret):
        """Hash every item of a list/tuple feature into ``ret``.

        Each item is hashed under the name ``"<counter>_<key>"`` and the
        running counter is also passed as the column index. The counter
        advances for every item, including items skipped as invalid.
        """
        for item in obj:
            self.single_hash("%s_%s" % (self.__counter, key),
                             item, ret, self.__counter)
            self.__counter += 1

    def hash(self, obj):
        """Hash one record (a dict of feature name -> value).

        Keys rejected by ``check_valid`` are skipped, the ``__label__``
        entry is taken as the label, list/tuple values go through
        ``list_hash`` and everything else through ``single_hash``.

        :returns: the result of ``self.dense_format(label, ret)`` in dense
            mode, else ``self.format(label, ret)``.
        """
        if self.debug:
            logger.debug(obj)
        self.__init_variable()
        ret = {}
        label = None
        for u in obj:
            if not self.check_valid(u):
                continue
            value = obj[u]
            if u == "__label__":
                label = value
                continue
            if isinstance(value, (list, tuple)):
                self.list_hash(u, value, ret)
            else:
                # NOTE(review): every scalar gets index 0, so with
                # use_col_index=True all scalar numeric features are
                # written to ret[0] and overwrite each other - confirm
                # whether that is intended.
                self.single_hash(u, value, ret, 0)
        # NOTE(review): format / dense_format are not defined in this
        # class as visible here; presumably provided elsewhere - confirm.
        if self.dense:
            msg = self.dense_format(label, ret)
        else:
            msg = self.format(label, ret)
        if self.debug:
            logger.debug("%s" % msg)
        return msg
if __name__ == "__main__":
    # Demo run: hash a handful of sample records, report the collision
    # rate, then hash one more record afterwards.
    samples = [
        {"name":"mooncake","age":12,"float":3.333,"nickname":"mooncake","-":"","__label__":343},
        {"name":"moake","age":12,"float":5.333,"nickname":"moake","ffff":"","asdf":"-","vec":["23"]},
        {"name":"moake","age":32,"float":5.33,"vec":["23","moon","-"]},
        {"name":"moake2","age":32,"float":5.33,"vec":["23","moon","-"]},
        {"name":"moake2","age":32,"float":5.33,"vec":[123.2,112.3,44.4]},
        {"__label__":1,"id":"394848222","vec":[123.2,112.3,44.4]},
        {'w2v': [0.007911, -0.093373, -0.15307, -0.024283, -0.044193, 0.160349, -0.024016, 0.007423, 0.149864, 0.135744, 0.016073, 0.045109, -0.011489, -0.105786, 0.097938, -0.091035, 0.170713, 0.086309, -0.019482, -0.05405, -0.193355, -0.106077, -0.065943, 0.091179, 0.133637, -0.038045, 0.125531, 0.163907, -0.087991, 0.088282, 0.185405, -0.042518, -0.005262, 0.038919, 0.011682, 0.041738, -0.150831, 0.060612, 0.165593, -0.113252, 0.021496, -0.0505, 0.049408, -0.149098, -0.106122, 0.162164, 0.174148, 0.081231, -0.013936, -0.14077], 'uid': '66652331', 'zhuboid': '92677035', '__label__': 0},
    ]
    # Hashed only after the collision report, as in the original order.
    late_sample = {'w2v': [-0.289897, -0.280452, -0.089623, 0.383446, -0.143555, -0.197646, -0.259489, -0.246846, -0.00203, 0.199725, 0.242156, -0.099511, 0.165036, 0.0781, 0.353059, 0.067087, -0.013154, -0.414995, -0.049902, -0.175679], 'user_w2v': [-0.006176, -0.016736, -0.001631, -0.144528, 0.137523, 0.022742, -0.105139, -0.088976, 0.030469, 0.197202, 0.306016, -0.102512, -0.009773, -0.03308, 0.079476, -0.195709, 0.021524, -0.177388, 0.052616, 0.14131], '__label__': 0}

    hasher = FeatureHasher(size=100000000, hash_module="city",
                           print_collision=True, debug=True,
                           dense=False, use_col_index=True)
    for sample in samples:
        hasher.hash(sample)
    hasher.collision()
    hasher.hash(late_sample)