【创新实训】数据预处理(二)

    技术2022-07-11  85

    分割name

    把name分割成中文名和外文名

    连接数据库

    从 setting.json 中读取 MongoDB 的 host、用户名和密码，连接后获取 movie 库

    # Connect to the database.
    # setting.json must provide the keys 'host', 'username' and 'password'.
    # JSON text is read as UTF-8 explicitly rather than with the platform default.
    with open('setting.json', encoding='utf-8') as f:
        setting = json.load(f)

    # Handle to the "movie" database on the configured host (port 27017).
    db = MongoClient(
        "mongodb://{}:27017/movie".format(setting['host']),
        username=setting['username'],
        password=setting['password'],
    )["movie"]

    # OpenCC converter using the 't2s' configuration
    # (presumably Traditional -> Simplified Chinese; confirm against the OpenCC docs).
    cc = opencc.OpenCC('t2s')

    百度翻译api

    # Baidu Translate API credentials.
    # NOTE(review): these are hard-coded secrets. Consider loading them from
    # setting.json, like the MongoDB credentials, instead of publishing them.
    appid = '20200623000504514'
    secretKey = 'LC_mF3p9AiAfUhHbzAGs'

    # "<text ending in a CJK ideograph> <Latin letters / digits / punctuation>".
    # The hyphen is escaped so it is a literal '-': written unescaped as ':-?'
    # it formed the range ':'..'?' and hyphenated titles could never match.
    _NAME_SPLIT_RE = re.compile(r'(.*[\u4e00-\u9fa5]) ([A-Za-z !.,:\-?&()\d]+)$')


    def common_char(string1, string2):
        """Return True if at least one character of string1 occurs in string2."""
        return any(c in string2 for c in string1)


    def _is_chinese(token, salt):
        """Return True if Baidu's language-detection API reports token as "zh".

        Any non-zero error_code is treated as "not Chinese".
        """
        sign = hashlib.md5(
            (appid + token + str(salt) + secretKey).encode()
        ).hexdigest()
        url = (
            '/api/trans/vip/language'
            + '?appid=' + appid
            + '&q=' + urllib.parse.quote(token)
            + '&salt=' + str(salt)
            + '&sign=' + sign
        )
        httpClient = http.client.HTTPConnection('api.fanyi.baidu.com')
        try:
            httpClient.request('GET', url)
            result = json.loads(httpClient.getresponse().read().decode("utf-8"))
        finally:
            # Release the socket even if the request or the decoding fails.
            httpClient.close()
        return result['error_code'] == 0 and result['data']['src'] == "zh"


    def _split_by_language(full_name):
        """Split full_name at the first space-separated token that is not Chinese.

        Returns (name, nameFrn): the tokens before the first non-Chinese token,
        and that token together with everything after it. No further API calls
        are made once the first non-Chinese token has been found.
        """
        salt = random.randint(32768, 65536)
        tokens = full_name.split(' ')
        first_foreign = len(tokens)
        for index, token in enumerate(tokens):
            if not _is_chinese(token, salt):
                first_foreign = index
                break
        return ' '.join(tokens[:first_foreign]), ' '.join(tokens[first_foreign:])


    # Douban records whose name contains a CJK ideograph, then a space, and
    # ends in something that is neither a CJK ideograph nor a digit.
    # '\\d' keeps the backslash for the server-side regex; the \u escapes are
    # resolved by Python so the server receives the literal characters.
    first_pass_query = {
        'source': 'douban',
        "name": {"$regex": "[\u4e00-\u9fa5].* .*[^\u4e00-\u9fa5\\d]"},
    }

    # A cursor opened with no_cursor_timeout must be closed explicitly;
    # the with-block does that even if processing raises.
    with db['details'].find(first_pass_query, no_cursor_timeout=True) as cursor:
        for item in cursor:
            # Only process records whose nameFrn is missing or blank.
            if (item.get('nameFrn') or '').strip():
                continue

            # First try to split with the regular expression.
            match = _NAME_SPLIT_RE.match(item['name'])
            if match:
                name, nameFrn = match.group(1), match.group(2)
            else:
                # For names the regex cannot split, fall back to the
                # language-detection endpoint of the Baidu Translate API.
                name, nameFrn = _split_by_language(item['name'])

            name = name.strip()
            nameFrn = nameFrn.strip()
            if name == "":
                # No Chinese part was found: use the foreign name for both fields.
                name = nameFrn
            print(name + "|*********|" + nameFrn)
            if name != "":
                db['details'].update_one(
                    {"sourceId": item["sourceId"], 'source': 'douban'},
                    {"$set": {"name": name, "nameFrn": nameFrn}},
                )

    但当 name 中同时包含中文和日文（或韩文）时，语种识别无法正确区分，导致这类 name 不能被正确分割

    做进一步处理

    # Second pass: further processing for names in which Chinese and Japanese
    # could not be told apart accurately by the first pass.

    # Characters in U+3040-U+31FF (the kana blocks and their neighbours),
    # Hangul syllables (U+AC00-U+D7AF) or Hangul Jamo (U+1100-U+11FF).
    _KANA_HANGUL_RE = re.compile('[\u3040-\u31FF\uAC00-\uD7AF\u1100-\u11FF]')
    # Characters in U+3040-U+31FF only.
    _KANA_RE = re.compile('[\u3040-\u31FF]')


    def _save_split(item, name, nameFrn):
        """Persist the split name for the record identified by item['_id'].

        update_one() requires an update-operator document; passing the whole
        record as the second argument is rejected, so only the two changed
        fields are written with $set.
        """
        db['details'].update_one(
            {'_id': item['_id']},
            {'$set': {'name': name, 'nameFrn': nameFrn}},
        )


    # Same selection as the first pass. '\\d' keeps the backslash for the
    # server-side regex; the \u escapes are resolved by Python.
    second_pass_query = {
        'source': 'douban',
        "name": {"$regex": "[\u4e00-\u9fa5].* .*[^\u4e00-\u9fa5\\d]"},
    }

    # A cursor opened with no_cursor_timeout must be closed explicitly;
    # the with-block does that even if processing raises.
    with db['details'].find(second_pass_query, no_cursor_timeout=True) as cursor:
        for item in cursor:
            tokens = item['name'].split(' ')

            # The first token is already Japanese/Korean: nothing to split off.
            if _KANA_HANGUL_RE.search(tokens[0]):
                continue

            if len(tokens) == 2 and _KANA_HANGUL_RE.search(tokens[1]):
                # Exactly "<Chinese> <Japanese/Korean>": split at the single space.
                _save_split(item, tokens[0], tokens[1])
            elif _KANA_RE.search(item['name']):
                # Japanese: split at the first later token that, once converted
                # by OpenCC, shares a character with the first token. The last
                # character of each side is excluded from the comparison
                # (NOTE(review): the reason is not visible here; confirm).
                for i in range(1, len(tokens)):
                    if tokens[i] == '':
                        continue
                    potential = cc.convert(tokens[i])
                    if common_char(potential[:-1], tokens[0][:-1]):
                        _save_split(
                            item,
                            ' '.join(tokens[:i]),
                            ' '.join(tokens[i:]),
                        )
                        break

    成功分割

    Processed: 0.016, SQL: 9