tiantian_pic_2_port.py 5.0 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174
  1. # -*- coding: utf-8 -*-
  2. # !/usr/bin/env python
  3. import os, sys,time,datetime
  4. import urllib
  5. import requests
  6. from mongoengine import register_connection, PointField, DynamicDocument, StringField
  7. import simplejson as json
  8. import base64
  9. import sys
  10. import json
  11. import base64
  12. import urllib2
  13. from urllib import quote_plus
  14. from urllib2 import urlopen
  15. from urllib2 import Request
  16. from urllib2 import URLError
  17. from urllib import urlencode
  18. from django.db.models.fields import DateTimeField
  19. PROJECT_ROOT = os.path.join(os.path.abspath(os.path.split(os.path.realpath(__file__))[0] + "/.."), '..')
  20. sys.path.insert(0, PROJECT_ROOT)
  21. os.environ.setdefault("DJANGO_SETTINGS_MODULE", "configs.testing")
  22. from script.base import init_env
  23. init_env(interactive = False)
  24. from apps.web.core.db import Searchable
  25. # 防止https证书校验不正确
  26. import ssl
  27. register_connection(alias = 'spider',
  28. name = 'spider',
  29. host = '116.62.228.194',
  30. port = 27017,
  31. username = 'dba',
  32. password = 'dayuan@2020..',
  33. authentication_source = 'admin')
  34. class tiantianPort(Searchable):
  35. addr = StringField(default = '')
  36. portId = StringField(default = '')
  37. meta = {
  38. 'collection': 'tiantian_port',
  39. 'db_alias': 'spider',
  40. 'unique_together': {'portId'}
  41. }
  42. IS_PY3 = sys.version_info.major == 3
  43. if IS_PY3:
  44. from urllib.request import urlopen
  45. from urllib.request import Request
  46. from urllib.error import URLError
  47. from urllib.parse import urlencode
  48. from urllib.parse import quote_plus
  49. else:
  50. import urllib2
  51. from urllib import quote_plus
  52. from urllib2 import urlopen
  53. from urllib2 import Request
  54. from urllib2 import URLError
  55. from urllib import urlencode
  56. ssl._create_default_https_context = ssl._create_unverified_context
  57. # 利用百度APP,直接解析截图中的地址,以及端口编号。
  58. API_KEY = 'OVcN78LP40CBEwWk5REF2Hyu'
  59. SECRET_KEY = 'a7luZBdbzjsfU9oE2GD3yPeTBgPty03t'
  60. OCR_URL = "https://aip.baidubce.com/rest/2.0/ocr/v1/accurate_basic"
  61. TOKEN_URL = 'https://aip.baidubce.com/oauth/2.0/token'
  62. def fetch_token():
  63. params = {'grant_type': 'client_credentials',
  64. 'client_id': API_KEY,
  65. 'client_secret': SECRET_KEY}
  66. post_data = urlencode(params)
  67. if (IS_PY3):
  68. post_data = post_data.encode('utf-8')
  69. req = Request(TOKEN_URL, post_data)
  70. try:
  71. f = urlopen(req, timeout=5)
  72. result_str = f.read()
  73. except URLError as err:
  74. print(err)
  75. if (IS_PY3):
  76. result_str = result_str.decode()
  77. result = json.loads(result_str)
  78. if ('access_token' in result.keys() and 'scope' in result.keys()):
  79. if not 'brain_all_scope' in result['scope'].split(' '):
  80. print ('please ensure has check the ability')
  81. exit()
  82. return result['access_token']
  83. else:
  84. print ('please overwrite the correct API_KEY and SECRET_KEY')
  85. exit()
  86. request_url = "https://aip.baidubce.com/rest/2.0/ocr/v1/general_basic" # accurate_basic
  87. def read_file(image_path):
  88. f = None
  89. try:
  90. f = open(image_path, 'rb')
  91. return f.read()
  92. except:
  93. print('read image file fail')
  94. return None
  95. finally:
  96. if f:
  97. f.close()
  98. def request(url, data):
  99. req = Request(url, data.encode('utf-8'))
  100. has_error = False
  101. try:
  102. f = urlopen(req)
  103. result_str = f.read()
  104. if (IS_PY3):
  105. result_str = result_str.decode()
  106. return result_str
  107. except URLError as err:
  108. print(err)
  109. # 二进制方式打开图片文件
  110. # 获取access token
  111. token = fetch_token()
  112. # 拼接通用文字识别高精度url
  113. rootdir = u'F:/爬虫相关/天天充电/端口/'
  114. listFile = os.listdir(rootdir) # 列出文件夹下所有的目录与文件
  115. for i in range(0, len(listFile)):
  116. print i
  117. try:
  118. path = os.path.join(rootdir, listFile[i])
  119. f = open(path, 'rb')
  120. img = base64.b64encode(f.read())
  121. if f:
  122. f.close()
  123. # 调用文字识别服务
  124. params = {"image":img}
  125. access_token = token
  126. request_url = request_url + "?access_token=" + token
  127. headers = {'content-type': 'application/x-www-form-urlencoded'}
  128. response = requests.post(request_url, data=params, headers=headers)
  129. # if response:
  130. # print (response.json())
  131. # 解析返回结果
  132. result_json = response.json()
  133. addr = ''
  134. for words_result in result_json["words_result"]:
  135. text = words_result["words"]
  136. zeroIndex = text.find('0')
  137. if zeroIndex >= 0 :
  138. addr += text[0:zeroIndex]
  139. portId = text[zeroIndex::]
  140. tiantianPort.get_collection().update({'portId':portId},{'$set':{'portId':portId,'addr':addr}},upsert = True)
  141. addr = ''
  142. else:
  143. addr = text
  144. except Exception:
  145. continue
  146. # 打印文字
  147. print('OK')