__init__.py 15 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422
  1. # -*- coding: utf-8 -*-
"""
@author:XuMing(xuming624@qq.com)
@description: Extract province / city / area fields from Chinese address strings.
"""
  6. import os
  7. import six
  8. import warnings
  9. try:
  10. from collections.abc import Iterable
  11. except ImportError:
  12. from collections import Iterable
  13. import pandas as pd
  14. from .structures import AddrMap, Pca
  15. from .structures import P, C, A
__version__ = "0.2.4"
# Directory that contains this module.
pwd_path = os.path.abspath(os.path.dirname(__file__))
# Administrative-division address file (CSV) located next to this module.
pca_path = os.path.join(pwd_path, 'pca.csv')
# Text (unicode) string type of the running interpreter:
# `unicode` on Python 2, `str` on Python 3.
if six.PY2:
    text_type = unicode
else:
    text_type = str
  24. def convert_to_unicode(text):
  25. """Converts `text` to Unicode (if it's not already), assuming utf-8 input."""
  26. if not isinstance(text, text_type):
  27. try:
  28. text = text.decode('utf-8')
  29. except UnicodeDecodeError:
  30. text = text.decode('gbk', 'ignore')
  31. except Exception as e:
  32. warnings.warn('Convert to unicode error: %s, text: %s' % (e, text))
  33. return text
  34. def _data_from_csv():
  35. """
  36. 从csv文件获取数据
  37. :return: (AddrMap, AddrMap, AddrMap, dict, dict)
  38. """
  39. # 区名及其简写 -> 相关pca元组
  40. area_map = AddrMap()
  41. # 城市名及其简写 -> 相关pca元组
  42. city_map = AddrMap()
  43. # (省名全称, 区名全称) -> 相关pca元组
  44. province_area_map = AddrMap()
  45. # 省名 -> 省全名
  46. province_map = {}
  47. # (省名, 市名, 区名) -> (纬度,经度)
  48. latlng = {}
  49. # 数据约定:国家直辖市的sheng字段为直辖市名称, 省直辖县的city字段为空
  50. pca_df = pd.read_csv(pca_path, sep=',', header=0, encoding='utf-8')
  51. pca_df = pca_df.fillna('')
  52. for record_dict in pca_df.values:
  53. # latlng[(record_dict['sheng'], record_dict['shi'], record_dict['qu'])] =
  54. # (record_dict['lat'], record_dict['lng'])
  55. record_dict = [convert_to_unicode(i) if i is isinstance(i, text_type) else i for i in record_dict]
  56. latlng[(record_dict[1], record_dict[2], record_dict[3])] = (record_dict[4], record_dict[5])
  57. _fill_province_map(province_map, record_dict)
  58. _fill_area_map(area_map, record_dict)
  59. _fill_city_map(city_map, record_dict)
  60. _fill_province_area_map(province_area_map, record_dict)
  61. return area_map, city_map, province_area_map, province_map, latlng
  62. def _fill_province_area_map(province_area_map, record_dict):
  63. """
  64. 填充省,区
  65. :param province_area_map: AddrMap
  66. :param record_dict:
  67. :return:
  68. """
  69. pca_tuple = (record_dict[1], record_dict[2], record_dict[3])
  70. key = (record_dict[1], record_dict[3])
  71. # 第三个参数在此处没有意义
  72. province_area_map.append_relational_addr(key, pca_tuple, P)
# Confusable area names to filter out, e.g. for an address such as '河北北戴河富丽小区1号'.
filter_area_names = [u'河北区', u'新城区']
# Short forms for some common autonomous counties (full name -> short name).
short_area_names = {
    u'白沙黎族自治县': u'白沙县',
    u'昌江黎族自治县': u'昌江县',
    u'乐东黎族自治县': u'乐东县',
    u'陵水黎族自治县': u'陵水县',
    u'保亭黎族苗族自治县': u'保亭县',
    u'琼中黎族苗族自治县': u'琼中县',
    u'长阳土家族自治县': u'长阳县',
    u'五峰土家族自治县': u'五峰县',
    u'大通回族土族自治县': u'大通县',
    u'民和回族土族自治县': u'民和县',
    u'互助土族自治县': u'互助县',
    u'化隆回族自治县': u'化隆县',
    u'循化撒拉族自治县': u'循化县',
    u'青龙满族自治县': u'青龙县',
    u'屏边苗族自治县': u'屏边县',
    u'金平苗族瑶族傣族自治县': u'金平县',
    u'河口瑶族自治县': u'河口县',
    u'丰宁满族自治县': u'丰宁县',
    u'宽城满族自治县': u'宽城县',
    u'围场满族蒙古族自治县': u'围场县',
}
  98. def _fill_area_map(area_map, record_dict):
  99. """
  100. 填充三级区划(区级)地名,包括简称
  101. :param area_map: AddrMap, dict
  102. :param record_dict: dict
  103. :return: area_map
  104. """
  105. area_name = record_dict[3]
  106. pca_tuple = (record_dict[1], record_dict[2], record_dict[3])
  107. area_map.append_relational_addr(area_name, pca_tuple, A)
  108. # 自治县区划简称
  109. if area_name in short_area_names.keys():
  110. area_map.append_relational_addr(short_area_names[area_name], pca_tuple, A)
  111. # 4字区划简称
  112. elif len(area_name) > 3 and (area_name.endswith(u'新区') or area_name.endswith(u'城区') or area_name.endswith(u'林区')):
  113. area_map.append_relational_addr(area_name[:-2], pca_tuple, A)
  114. # 过滤的区划名称
  115. elif area_name in filter_area_names:
  116. pass
  117. # 3字区划简称,'XX区'不简写
  118. elif len(area_name) > 2 and (area_name.endswith(u'市') or area_name.endswith(u'县')):
  119. area_map.append_relational_addr(area_name[:-1], pca_tuple, A)
# Confusable city names to filter out, e.g. the ambiguity between 吉林省 (province) and 吉林市 (city).
filter_city_names = [u'吉林市']
  122. def _fill_city_map(city_map, record_dict):
  123. """
  124. 填充二级区划(市级)地名,包括简称
  125. :param city_map: AddrMap, dict
  126. :param record_dict: dict
  127. :return: city_map
  128. """
  129. city_name = record_dict[2] # shi
  130. pca_tuple = (record_dict[1], record_dict[2], record_dict[3])
  131. city_map.append_relational_addr(city_name, pca_tuple, C)
  132. # fix 吉林省、吉林市的混淆
  133. if city_name in filter_city_names:
  134. pass
  135. elif city_name.endswith(u'市'):
  136. city_map.append_relational_addr(city_name[:-1], pca_tuple, C)
  137. # 特别行政区
  138. # elif city_name == u'香港特别行政区':
  139. # city_map.append_relational_addr(u'香港', pca_tuple, C)
  140. # elif city_name == u'澳门特别行政区':
  141. # city_map.append_relational_addr(u'澳门', pca_tuple, C)
  142. # 自治区下的二级区划,eg喀什地区
  143. elif len(city_name) > 3 and city_name.endswith(u'地区'):
  144. city_map.append_relational_addr(city_name[:-2], pca_tuple, C)
  145. def _fill_province_map(province_map, record_dict):
  146. """
  147. 填充一级区划(省级)地名,包括简称
  148. :param province_map: dict
  149. :param record_dict: dict
  150. :return: province_map
  151. """
  152. sheng = record_dict[1] # sheng
  153. if sheng not in province_map:
  154. province_map[sheng] = sheng
  155. # 处理省的简写情况
  156. # 普通省分 和 直辖市
  157. if sheng.endswith(u'省') or sheng.endswith(u'市'):
  158. province_map[sheng[:-1]] = sheng
  159. # 自治区
  160. elif sheng == u'新疆维吾尔自治区':
  161. province_map[u'新疆'] = sheng
  162. elif sheng == u'内蒙古自治区':
  163. province_map['内蒙古'] = sheng
  164. elif sheng == u'广西壮族自治区':
  165. province_map[u'广西'] = sheng
  166. province_map[u'广西省'] = sheng
  167. elif sheng == u'西藏自治区':
  168. province_map[u'西藏'] = sheng
  169. elif sheng == u'宁夏回族自治区':
  170. province_map[u'宁夏'] = sheng
  171. # 特别行政区
  172. elif sheng == u'香港特别行政区':
  173. province_map[u'香港'] = sheng
  174. elif sheng == u'澳门特别行政区':
  175. province_map[u'澳门'] = sheng
# Lookup tables, built once when the module is imported.
area_map, city_map, province_area_map, province_map, latlng = _data_from_csv()
# Municipalities directly under the central government.
munis = {u'北京市', u'天津市', u'上海市', u'重庆市'}
  179. def is_munis(city_full_name):
  180. return city_full_name in munis
# Area -> city mapping; used as the default `umap` argument of `transform`.
myumap = {
    u'南关区': u'长春市',
    u'南山区': u'深圳市',
    u'宝山区': u'上海市',
    u'普陀区': u'上海市',
    u'浦东区': u'上海市',
    u'市辖区': u'东莞市',
    u'朝阳区': u'北京市',
    u'河东区': u'天津市',
    u'白云区': u'广州市',
    u'西湖区': u'杭州市',
    u'铁西区': u'沈阳市',
}
  195. def transform(location_strs, umap=myumap, index=[], cut=False, lookahead=8, pos_sensitive=False, open_warning=False):
  196. """将地址描述字符串转换以"省","市","区"信息为列的DataFrame表格
  197. Args:
  198. locations:地址描述字符集合,可以是list, Series等任意可以进行for in循环的集合
  199. 比如:["徐汇区虹漕路461号58号楼5楼", "泉州市洛江区万安塘西工业区"]
  200. umap:自定义的区级到市级的映射,主要用于解决区重名问题,如果定义的映射在模块中已经存在,则会覆盖模块中自带的映射
  201. index:可以通过这个参数指定输出的DataFrame的index,默认情况下是range(len(data))
  202. cut:是否使用分词,默认使用,分词模式速度较快,但是准确率可能会有所下降
  203. lookahead:只有在cut为false的时候有效,表示最多允许向前看的字符的数量
  204. 默认值为8是为了能够发现"新疆维吾尔族自治区"这样的长地名
  205. 如果你的样本中都是短地名的话,可以考虑把这个数字调小一点以提高性能
  206. pos_sensitive:如果为True则会多返回三列,分别提取出的省市区在字符串中的位置,如果字符串中不存在的话则显示-1
  207. open_warning: 是否打开umap警告, 默认关闭
  208. Returns:
  209. 一个Pandas的DataFrame类型的表格,如下:
  210. |省 |市 |区 |地名 |
  211. |上海市|上海市|徐汇区|虹漕路461号58号楼5楼 |
  212. |福建省|泉州市|洛江区|万安塘西工业区 |
  213. """
  214. if not isinstance(location_strs, Iterable):
  215. from .exceptions import InputTypeNotSuportException
  216. raise InputTypeNotSuportException(
  217. 'location_strs参数必须为可迭代的类型(比如list, Series等实现了__iter__方法的对象)')
  218. result = pd.DataFrame(
  219. [_handle_one_record(addr, umap, cut, lookahead, pos_sensitive, open_warning) for addr in location_strs],
  220. index=index) \
  221. if index else pd.DataFrame(
  222. [_handle_one_record(addr, umap, cut, lookahead, pos_sensitive, open_warning) for addr in location_strs])
  223. # 这句的唯一作用是让列的顺序好看一些
  224. if pos_sensitive:
  225. return result.loc[:, ('省', '市', '区', '地名', '省_pos', '市_pos', '区_pos')]
  226. else:
  227. return result.loc[:, ('省', '市', '区', '地名')]
  228. def _handle_one_record(addr, umap, cut, lookahead, pos_sensitive, open_warning):
  229. """处理一条记录"""
  230. addr = convert_to_unicode(addr)
  231. # 空记录
  232. if not addr or not isinstance(addr, text_type):
  233. empty = {'省': '', '市': '', '区': ''}
  234. if pos_sensitive:
  235. empty['省_pos'] = -1
  236. empty['市_pos'] = -1
  237. empty['区_pos'] = -1
  238. return empty
  239. # 地名提取
  240. pca, left_addr = _extract_addr(addr, cut, lookahead)
  241. # 填充市
  242. _fill_city(pca, umap, open_warning)
  243. # 填充省
  244. _fill_province(pca)
  245. result = pca.propertys_dict(pos_sensitive)
  246. result['地名'] = left_addr
  247. return result
  248. def _fill_province(pca):
  249. """填充省"""
  250. if (not pca.province) and pca.city and (pca.city in city_map):
  251. pca.province = city_map.get_value(pca.city, P)
def _fill_city(pca, umap, open_warning):
    """Fill in `pca.city` when it is missing, using the area (and province).

    Sources are tried in this order, stopping at the first that yields a city:

    1. `umap`, the caller-supplied area -> city mapping;
    2. `area_map`, when the area is found there and `is_unique_value` holds
       for it (presumably: the name maps to a single record -- confirm in
       AddrMap); if a province is known it must equal the province stored
       for the area;
    3. `province_area_map`, keyed by (province, area), under the same
       uniqueness condition.

    When none applies and `open_warning` is true, a warning is emitted.

    NOTE(review): the source this block was recovered from had lost its
    indentation; the nesting below (the `else` pairing with
    `if pca.province`, the warning inside `if not pca.city`) is
    reconstructed -- confirm against the canonical file.

    :param pca: Pca record being completed; `city` is set in place
    :param umap: mapping with a `get` method, area name -> city name
    :param open_warning: whether to warn when no city can be determined
    :return: None
    """
    if not pca.city:
        # Map from the area.
        if pca.area:
            # First try the caller-supplied umap.
            if umap.get(pca.area):
                pca.city = umap.get(pca.area)
                return
            if pca.area in area_map and area_map.is_unique_value(pca.area):
                if pca.province:
                    # A known province must agree with the one stored for the area.
                    if area_map.get_value(pca.area, P) == pca.province:
                        pca.city = area_map.get_value(pca.area, C)
                        return
                else:
                    pca.city = area_map.get_value(pca.area, C)
                    return
        # Map from (province, area).
        if pca.area and pca.province:
            newKey = (pca.province, pca.area)
            if newKey in province_area_map and province_area_map.is_unique_value(newKey):
                pca.city = province_area_map.get_value(newKey, C)
                return
        if open_warning:
            warnings.warn("%s 无法映射, 建议添加进umap中" % pca.area)
  277. def _extract_addr(addr, cut, lookahead):
  278. """提取地址中的省,市,区名称
  279. Args:
  280. addr:原始地址字符串
  281. cut: 是否分词
  282. Returns:
  283. [sheng, shi, qu, (sheng_pos, shi_pos, qu_pos)], addr
  284. """
  285. return _jieba_extract(addr) if cut else _full_text_extract(addr, lookahead)
  286. def _jieba_extract(addr):
  287. """基于结巴分词进行提取"""
  288. import jieba
  289. result = Pca()
  290. pos = 0
  291. truncate = {0: 0}
  292. def _set_pca(pca_property, name, full_name):
  293. """pca_property: 'province', 'city' or 'area'"""
  294. if not getattr(result, pca_property):
  295. setattr(result, pca_property, full_name)
  296. setattr(result, pca_property + "_pos", pos)
  297. if is_munis(full_name):
  298. setattr(result, "province_pos", pos)
  299. # nonlocal truncate, replace with dict
  300. # refer: https://www.it610.com/article/50433.htm
  301. if pos == truncate[0]:
  302. truncate[0] += len(name)
  303. for word in jieba.cut(addr):
  304. # 优先提取低级别行政区 (主要是为直辖市和特别行政区考虑)
  305. if word in area_map:
  306. _set_pca('area', word, area_map.get_full_name(word))
  307. elif word in city_map:
  308. _set_pca('city', word, city_map.get_full_name(word))
  309. elif word in province_map:
  310. _set_pca('province', word, province_map[word])
  311. pos += len(word)
  312. return result, addr[truncate[0]:]
# Characters that disqualify a candidate name in _full_text_extract when they
# directly follow it (路 road, 街 street, 村 village, 桥 bridge).
filter_address_chars = [u'路', u'街', u'村', u'桥']
  314. def _full_text_extract(addr, lookahead):
  315. """全文匹配进行提取"""
  316. result = Pca()
  317. truncate = {0: 0}
  318. def _set_pca(pca_property, pos, name, full_name):
  319. """pca_property: 'province', 'city' or 'area'"""
  320. def _defer_set():
  321. if not getattr(result, pca_property):
  322. setattr(result, pca_property, full_name)
  323. setattr(result, pca_property + "_pos", pos)
  324. if is_munis(full_name):
  325. setattr(result, "province_pos", pos)
  326. # nonlocal truncate
  327. if pos == truncate[0]:
  328. truncate[0] += len(name)
  329. return len(name)
  330. return _defer_set
  331. # i为起始位置
  332. i = 0
  333. while i < len(addr):
  334. # 用于设置pca属性的函数
  335. defer_fun = None
  336. # length为从起始位置开始的长度,从中提取出最长的地址
  337. for length in range(1, lookahead + 1):
  338. end_pos = i + length
  339. if end_pos > len(addr):
  340. break
  341. word = addr[i:end_pos]
  342. word_next = addr[end_pos] if end_pos < len(addr) else ''
  343. # 优先提取低级别的行政区 (主要是为直辖市和特别行政区考虑)
  344. if word_next in filter_address_chars:
  345. continue
  346. elif word in area_map:
  347. defer_fun = _set_pca('area', i, word, area_map.get_full_name(word))
  348. continue
  349. elif word in city_map:
  350. defer_fun = _set_pca('city', i, word, city_map.get_full_name(word))
  351. continue
  352. elif word in province_map:
  353. defer_fun = _set_pca('province', i, word, province_map[word])
  354. continue
  355. if defer_fun:
  356. i += defer_fun()
  357. else:
  358. i += 1
  359. return result, addr[truncate[0]:]