Python 分离中、英文
本来打算使用正则表达式,但发现实际情况无法简单地用正则表达式处理,于是参考 http://blog.sina.com.cn/s/blog_933dc4350100x6mu.html 的做法,代码如下:
# coding=utf-8
'''
@CreateDate: 2013年 12月 03日 星期二 11:11:39 CST
@FileName:extract.py
@Description:负责对话中产品型号的提取
'''
import re
# Pre-compiled patterns shared by the helper functions in this module.
#
# NOTE(review): the digit patterns previously read "^d+.d+", "^d+*d+" and
# "^d+:d+:d+" -- the backslashes had been lost, so they matched the literal
# letter "d" instead of digits (and "^d+*d+" is not even a valid regex).
# The escapes below restore the apparent intent; confirm against real data.

# Captures the model number between "型号 : " and ", 数量" in an answer template.
restr = u"型号 : (.*), 数量"
p_tem = re.compile(restr, re.I)

# Decimal number at the start of the string, e.g. "12.5".
numstr = r"^\d+\.\d+"
p_num = re.compile(numstr, re.I)

# "<digits>*<digits>" at the start of the string, e.g. "3*4".
numstrt = r"^\d+\*\d+"
# Previously compiled from numstr by mistake, which made p_numt a copy of p_num.
p_numt = re.compile(numstrt, re.I)

# Clock time "H:M:S" at the start of the string, e.g. "11:11:39".
timestr = r"^\d+:\d+:\d+"
p_time = re.compile(timestr, re.I)
def is_numt(strs):
    """Return True when the module-level ``p_numt`` pattern finds a match in *strs*."""
    return bool(p_numt.findall(strs))
def is_time(strs):
    """Return True when the module-level ``p_time`` pattern finds a match in *strs*."""
    return bool(p_time.findall(strs))
def is_num(strs):
    """Return True when the module-level ``p_num`` pattern finds a match in *strs*."""
    return bool(p_num.findall(strs))
def get_type(temp_str):
    """Extract model numbers from an answer template.

    Returns the list produced by ``p_tem.findall(temp_str)``: one captured
    string per match, or an empty list when nothing matches.
    """
    return p_tem.findall(temp_str)
# Inclusive Unicode code-point ranges that is_zh() treats as Chinese text.
_ZH_RANGES = (
    (0x2e80, 0x33ff),    # CJK radicals, symbols and punctuation (span also covers kana etc.)
    (0x3400, 0x4dbf),    # CJK Unified Ideographs Extension A
    (0x4e00, 0x9fff),    # CJK Unified Ideographs
    (0xf900, 0xfad9),    # CJK Compatibility Ideographs
    (0xff00, 0xffef),    # Halfwidth and Fullwidth Forms
    (0x20000, 0x2a6d6),  # CJK Unified Ideographs Extension B
    (0x2f800, 0x2fa1d),  # CJK Compatibility Ideographs Supplement
)


def is_zh(c):
    """Return True if the single character *c* falls in a CJK range.

    *c* must be a one-character text (unicode) string; ``ord`` raises
    ``TypeError`` for anything longer.

    Fixes over the previous version:
    - Extension A (U+3400-U+4DBF) was named in a comment but never tested,
      so those ideographs were reported as non-Chinese.
    - The unified-ideograph range stopped at U+9FBB; it now runs to the end
      of the block (U+9FFF).
    """
    code_point = ord(c)  # Unicode code point, not just ASCII
    return any(low <= code_point <= high for low, high in _ZH_RANGES)
def split_zh_en(zh_en_str):
    """Return the runs of non-Chinese characters found in *zh_en_str*.

    The string is scanned character by character. Characters for which
    ``is_zh`` is true act as separators and are discarded; every maximal
    run of other characters (whitespace included) becomes one entry of the
    returned list, in order of appearance.
    """
    runs = []
    pending = []  # characters of the non-Chinese run currently being built
    for ch in zh_en_str:
        if is_zh(ch):
            # A Chinese character closes whatever run was in progress.
            if pending:
                runs.append("".join(pending))
                pending = []
        else:
            pending.append(ch)
    # Flush the trailing run when the string does not end in Chinese.
    if pending:
        runs.append("".join(pending))
    return runs
if __name__ == "__main__":
    # Small manual demo. The guard previously contained only comments, which
    # is a syntax error (an ``if`` needs at least one statement in its body).
    # Single-argument print(...) calls behave the same on Python 2 and 3.
    s = u"阿"
    print(hex(ord(s)))
    print(is_zh(s))

    st = u"型号 : MAL214699112E3 , 数量 : 210片 "
    print(get_type(st))

    stt = u"AT45DB161E-MHD-T   2450个"
    print(split_zh_en(stt))
 
				