分割name
把name分割成中文名和外文名
连接数据库
从setting.json中读取MongoDB的host、用户名和密码,并获取movie库
# Load the MongoDB connection settings (host, username, password) from
# setting.json in the current working directory.
with open('setting.json') as settings_file:
    setting = json.load(settings_file)

# Connect on port 27017 and keep a handle to the "movie" database.
mongo_uri = "mongodb://{}:27017/movie".format(setting['host'])
mongo_client = MongoClient(
    mongo_uri,
    username=setting['username'],
    password=setting['password'],
)
db = mongo_client["movie"]

# OpenCC converter built from the 't2s' configuration
# (Traditional-to-Simplified Chinese in OpenCC's config naming).
cc = opencc.OpenCC('t2s')
百度翻译api
import os

# Baidu Translate API credentials.
# NOTE(security): these values were hard-coded in the source. They can now be
# overridden through the BAIDU_FANYI_APPID / BAIDU_FANYI_SECRET_KEY
# environment variables; the original literals remain only as a
# backward-compatible fallback and should be rotated and removed.
appid = os.environ.get('BAIDU_FANYI_APPID', '20200623000504514')
secretKey = os.environ.get('BAIDU_FANYI_SECRET_KEY', 'LC_mF3p9AiAfUhHbzAGs')
def common_char(string1, string2):
    """Return True if any character of *string1* also occurs in *string2*.

    Returns False when the two share no character, including when either
    argument is empty.
    """
    return any(ch in string2 for ch in string1)
# Split each Douban ``name`` into a Chinese title (``name``) and a foreign
# title (``nameFrn``).

# Candidate records: ``name`` contains a CJK ideograph that is later followed
# by a space and then by some character that is neither a CJK ideograph nor a
# digit. The backslash in ``\\d`` is escaped so the string sent to MongoDB is
# unchanged while the Python literal no longer holds an invalid escape.
_SPLIT_CANDIDATE_FILTER = {
    'source': 'douban',
    "name": {"$regex": "[\u4e00-\u9fa5].* .*[^\u4e00-\u9fa5\\d]"},
}

# "<anything ending in a CJK ideograph> <ASCII title>".
# The original class contained ``:-?``, which the regex engine reads as the
# range ':' .. '?' (i.e. ``:;<=>?``) rather than a literal hyphen, so titles
# such as "Spider-Man" never matched. The class below keeps every character
# the old one accepted and adds a literal '-'.
_REGEX_SPLIT = re.compile(
    r'(.*[\u4e00-\u9fa5]) ([A-Za-z !.,:;<=>?&()\d-]+)$')


def _is_chinese_token(token, salt):
    """Return True if Baidu's language-detection endpoint reports "zh".

    Any non-zero ``error_code`` in the response counts as "not Chinese".
    The HTTP connection is always closed, even if the request fails.
    """
    sign = hashlib.md5(
        (appid + token + str(salt) + secretKey).encode()).hexdigest()
    url = ('/api/trans/vip/language'
           + '?appid=' + appid
           + '&q=' + urllib.parse.quote(token)
           + '&salt=' + str(salt)
           + '&sign=' + sign)
    connection = http.client.HTTPConnection('api.fanyi.baidu.com')
    try:
        connection.request('GET', url)
        payload = connection.getresponse().read().decode("utf-8")
    finally:
        connection.close()
    result = json.loads(payload)
    return result['error_code'] == 0 and result['data']['src'] == "zh"


def _split_name(full_name):
    """Split *full_name* into ``(name, nameFrn)``.

    The regular expression is tried first. When it does not match, the name
    is split on spaces and each token is sent to the language-detection API:
    leading tokens detected as Chinese form ``name``; the first token that is
    not Chinese, and everything after it, forms ``nameFrn``. If no token is
    Chinese, ``name`` falls back to ``nameFrn``.
    """
    match = _REGEX_SPLIT.match(full_name)
    if match:
        return match.group(1).strip(), match.group(2).strip()

    salt = random.randint(32768, 65536)
    # Consecutive spaces yield empty tokens; skip them instead of sending an
    # empty query to the API.
    tokens = [token for token in full_name.split(' ') if token]
    chinese_tokens = []
    foreign_tokens = []
    for index, token in enumerate(tokens):
        if not _is_chinese_token(token, salt):
            # Everything from the first non-Chinese token on is foreign;
            # no further API calls are needed.
            foreign_tokens = tokens[index:]
            break
        chinese_tokens.append(token)

    name = ' '.join(chinese_tokens)
    name_frn = ' '.join(foreign_tokens)
    return (name or name_frn), name_frn


cursor = db['details'].find(_SPLIT_CANDIDATE_FILTER, no_cursor_timeout=True)
try:
    for item in cursor:
        # Records that already carry a non-blank nameFrn are left alone.
        if (item.get('nameFrn') or '').strip():
            continue

        name, nameFrn = _split_name(item['name'])
        print(name + "|*********|" + nameFrn)
        if name != "":
            db['details'].update_one(
                {"sourceId": item["sourceId"], 'source': 'douban'},
                {"$set": {"name": name, "nameFrn": nameFrn}},
            )
finally:
    # A cursor opened with no_cursor_timeout is not reaped by the server on
    # idle, so close it explicitly.
    cursor.close()
但当name由中文名和日文名(或韩文名)组合而成时,上述方法无法正确区分中文和日文,
因此需要做进一步处理
# Second pass: names that mix a Chinese title with a Japanese or Korean one.

# Same candidate filter as the first pass; ``\\d`` is escaped so the Python
# literal is valid while the string sent to MongoDB is unchanged.
_CJK_CANDIDATE_FILTER = {
    'source': 'douban',
    "name": {"$regex": "[\u4e00-\u9fa5].* .*[^\u4e00-\u9fa5\\d]"},
}

# U+3040-U+31FF (includes hiragana and katakana), Hangul syllables
# (U+AC00-U+D7AF) and Hangul jamo (U+1100-U+11FF).
_KANA_OR_HANGUL = re.compile('[\u3040-\u31FF\uAC00-\uD7AF\u1100-\u11FF]')
_KANA_RANGE = re.compile('[\u3040-\u31FF]')


def _split_cjk_name(full_name):
    """Return ``(name, nameFrn)`` for *full_name*, or None if no split applies.

    * If the first space-separated token already contains kana/hangul, the
      name is skipped.
    * If there are exactly two tokens and the second contains kana/hangul,
      they become ``name`` and ``nameFrn`` respectively.
    * Otherwise, if the name contains a character in U+3040-U+31FF, the split
      point is the first later non-empty token whose OpenCC-converted form
      shares a character with the first token.
    """
    tokens = full_name.split(' ')
    if _KANA_OR_HANGUL.search(tokens[0]):
        return None
    if len(tokens) == 2 and _KANA_OR_HANGUL.search(tokens[1]):
        return tokens[0], tokens[1]
    if _KANA_RANGE.search(full_name):
        for i in range(1, len(tokens)):
            if tokens[i] == '':
                continue
            potential = cc.convert(tokens[i])
            # Both sides are compared with their final character dropped.
            # NOTE(review): the reason for ignoring the last character is
            # not evident from the code - confirm with the author.
            if common_char(potential[:-1], tokens[0][:-1]):
                return ' '.join(tokens[:i]), ' '.join(tokens[i:])
    return None


cursor = db['details'].find(_CJK_CANDIDATE_FILTER, no_cursor_timeout=True)
try:
    for item in cursor:
        split = _split_cjk_name(item['name'])
        if split is None:
            continue
        name, nameFrn = split
        # Debug aid: print(name, '|----|', nameFrn)
        # update_one needs an update-operator document; passing the whole
        # record, as before, is rejected by PyMongo. Only the two fields that
        # changed are written.
        db['details'].update_one(
            {'_id': item['_id']},
            {"$set": {"name": name, "nameFrn": nameFrn}},
        )
finally:
    # A cursor opened with no_cursor_timeout is not reaped by the server on
    # idle, so close it explicitly.
    cursor.close()
成功分割
转载请注明原文地址:https://ipadbbs.8miu.com/read-17299.html