本来打算使用正则表达式,但是发现实际情况不能简单地用正则表达式处理,于是参考 http://blog.sina.com.cn/s/blog_933dc4350100x6mu.html 改用逐字符判断的方式,代码如下:

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
# coding=utf-8
'''
@CreateDate: 2013年 12月 03日 星期二 11:11:39 CST
@FileName:extract.py
@Description:负责对话中产品型号的提取
'''
import re
# Captures the text between "型号 : " and ", 数量" in a reply template.
# The group is greedy, so with several ", 数量" markers in one string it
# extends to the last one.
restr = u"型号 : (.*), 数量"
p_tem = re.compile(restr, re.I)

# The three patterns below had lost their backslashes ("^d+.d+" etc.), so
# they matched literal "d" characters instead of digits; the escapes are
# restored here.

# Decimal number at the start of the string, e.g. "3.14".
numstr = r"^\d+\.\d+"
p_num = re.compile(numstr, re.I)

# "<digits>*<digits>" at the start of the string, e.g. "10*20".
# p_numt was previously compiled from numstr by mistake.
numstrt = r"^\d+\*\d+"
p_numt = re.compile(numstrt, re.I)

# "<digits>:<digits>:<digits>" at the start of the string, e.g. "11:11:39".
timestr = r"^\d+:\d+:\d+"
p_time = re.compile(timestr, re.I)
def is_numt(strs):
    """Report whether ``strs`` matches the module-level pattern ``p_numt``.

    Args:
        strs: the text to test.

    Returns:
        True if ``p_numt`` finds a match in ``strs``, otherwise False.
    """
    # search() stops at the first hit; no need to build a findall() list
    # just to check whether it is empty.
    return p_numt.search(strs) is not None
def is_time(strs):
    """Report whether ``strs`` matches the module-level pattern ``p_time``.

    Args:
        strs: the text to test.

    Returns:
        True if ``p_time`` finds a match in ``strs``, otherwise False.
    """
    # search() stops at the first hit; no need to build a findall() list
    # just to check whether it is empty.
    return p_time.search(strs) is not None
def is_num(strs):
    """Report whether ``strs`` matches the module-level pattern ``p_num``.

    Args:
        strs: the text to test.

    Returns:
        True if ``p_num`` finds a match in ``strs``, otherwise False.
    """
    # search() stops at the first hit; no need to build a findall() list
    # just to check whether it is empty.
    return p_num.search(strs) is not None
def get_type(temp_str):
    """Extract product model strings from a reply template.

    Args:
        temp_str: the reply-template text; the module-level pattern
            ``p_tem`` is built from a unicode literal, so unicode text is
            presumably expected here.

    Returns:
        The list produced by ``p_tem.findall(temp_str)``: one captured
        model string per match, or an empty list when nothing matches.
    """
    return p_tem.findall(temp_str)
def is_zh(c):
    """Report whether ``c`` is a Chinese (CJK or fullwidth) character.

    Args:
        c: a single unicode character. ``ord`` raises ``TypeError`` for
            anything that is not exactly one character.

    Returns:
        True if the code point of ``c`` falls in one of the ranges listed
        below, otherwise False.
    """
    x = ord(c)  # Unicode code point of the character
    zh_ranges = (
        (0x2e80, 0x33ff),    # Punct & Radicals
        (0x3400, 0x4dbf),    # CJK Unified Ideographs Extension A
                             # (named in the original comment but missing
                             # from its ranges)
        (0x4e00, 0x9fbb),    # CJK Unified Ideographs
        (0xf900, 0xfad9),    # CJK Compatibility Ideographs
        (0xff00, 0xffef),    # Fullwidth Latin Characters
        (0x20000, 0x2a6d6),  # CJK Unified Ideographs Extension B
        (0x2f800, 0x2fa1d),  # CJK Compatibility Supplement
    )
    return any(low <= x <= high for low, high in zh_ranges)
def split_zh_en(zh_en_str):
    """Collect the runs of non-Chinese characters in ``zh_en_str``.

    Each character is classified with ``is_zh``. Chinese characters act as
    separators and are dropped; every maximal run of other characters
    (including spaces and punctuation) becomes one output element.

    Args:
        zh_en_str: the text to split; its characters must be acceptable
            to ``is_zh``.

    Returns:
        A list of the non-Chinese runs in their original order. The list
        is empty when the input is empty or entirely Chinese.
    """
    en_group = []
    run = []  # characters of the non-Chinese run currently being collected
    for c in zh_en_str:
        if is_zh(c):
            # A Chinese character ends the current run, if there is one.
            if run:
                en_group.append("".join(run))
                run = []
        else:
            run.append(c)
    # Flush a run that reaches the end of the string.
    if run:
        en_group.append("".join(run))
    return en_group
if __name__ == "__main__":
    # Manual smoke test. The body used to hold only commented-out lines,
    # which leaves the ``if`` without a suite and makes the file
    # unparsable; the same examples are now executed instead.
    s = u"阿"
    print(is_zh(s))

    st = u"型号 : MAL214699112E3 , 数量 : 210片 "
    print(get_type(st))

    stt = u"AT45DB161E-MHD-T 2450个"
    print(split_zh_en(stt))