123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174 |
- # -*- coding: utf-8 -*-
- # !/usr/bin/env python
- import os, sys,time,datetime
- import urllib
- import requests
- from mongoengine import register_connection, PointField, DynamicDocument, StringField
- import simplejson as json
- import base64
- import sys
- import json
- import base64
- import urllib2
- from urllib import quote_plus
- from urllib2 import urlopen
- from urllib2 import Request
- from urllib2 import URLError
- from urllib import urlencode
- from django.db.models.fields import DateTimeField
# Project root: the parent of this script's directory, followed by one more
# '..' component. It is put first on sys.path so the project packages
# imported below can be resolved.
PROJECT_ROOT = os.path.join(
    os.path.abspath(os.path.dirname(os.path.realpath(__file__)) + "/.."),
    '..',
)
sys.path.insert(0, PROJECT_ROOT)

# Only applies when DJANGO_SETTINGS_MODULE is not already set.
os.environ.setdefault("DJANGO_SETTINGS_MODULE", "configs.testing")

# Must come after the sys.path change above.
from script.base import init_env
init_env(interactive=False)
- from apps.web.core.db import Searchable
- # Work around HTTPS certificate verification failures.
- import ssl
# Register the MongoDB connection that documents reach through the 'spider'
# alias.
# NOTE(review): host and credentials are hard-coded in source — consider
# loading them from configuration or the environment.
register_connection(
    'spider',
    name='spider',
    host='116.62.228.194',
    port=27017,
    username='dba',
    password='dayuan@2020..',
    authentication_source='admin',
)
class tiantianPort(Searchable):
    """Document pairing a port id with its address text.

    Records live in the ``tiantian_port`` collection and are reached through
    the ``spider`` connection alias.
    """

    # Address text belonging to the port; empty string by default.
    addr = StringField(default='')
    # Port identifier; empty string by default.
    portId = StringField(default='')

    # NOTE(review): 'unique_together' is presumably interpreted by
    # Searchable — confirm how it is enforced.
    meta = dict(
        collection='tiantian_port',
        db_alias='spider',
        unique_together={'portId'},
    )
# True when the interpreter is Python 3; picks the matching urllib imports so
# the rest of the script can use the same names on both major versions.
IS_PY3 = sys.version_info[0] == 3
if not IS_PY3:
    import urllib2
    from urllib import quote_plus
    from urllib import urlencode
    from urllib2 import Request
    from urllib2 import URLError
    from urllib2 import urlopen
else:
    from urllib.error import URLError
    from urllib.parse import quote_plus
    from urllib.parse import urlencode
    from urllib.request import Request
    from urllib.request import urlopen
-
# Replace the default HTTPS context factory with the unverified one.
# NOTE(review): this switches off certificate verification process-wide —
# confirm it is really needed.
ssl._create_default_https_context = ssl._create_unverified_context

# Baidu application credentials. The app is used to read the address and the
# port number straight out of the screenshots.
# NOTE(review): secrets are committed in source — consider moving them out.
API_KEY = 'OVcN78LP40CBEwWk5REF2Hyu'
SECRET_KEY = 'a7luZBdbzjsfU9oE2GD3yPeTBgPty03t'

# Service endpoints.
TOKEN_URL = 'https://aip.baidubce.com/oauth/2.0/token'
OCR_URL = "https://aip.baidubce.com/rest/2.0/ocr/v1/accurate_basic"
def fetch_token():
    """Request an access token from TOKEN_URL.

    Posts API_KEY and SECRET_KEY with the ``client_credentials`` grant type
    and parses the JSON reply.

    Returns:
        The value of the reply's ``access_token`` field.

    Raises:
        SystemExit: when the request fails with URLError, when the reply
            lacks ``access_token`` or ``scope``, or when the granted scope
            does not contain ``brain_all_scope``. A message is printed
            before exiting.
    """
    params = {
        'grant_type': 'client_credentials',
        'client_id': API_KEY,
        'client_secret': SECRET_KEY,
    }
    post_data = urlencode(params)
    if IS_PY3:
        # The Python 3 urlopen requires a bytes request body.
        post_data = post_data.encode('utf-8')

    req = Request(TOKEN_URL, post_data)
    try:
        f = urlopen(req, timeout=5)
        try:
            result_str = f.read()
        finally:
            # Release the connection even if reading fails.
            f.close()
    except URLError as err:
        # Without a reply there is nothing to parse; stop here instead of
        # falling through to an unbound result_str.
        print(err)
        sys.exit(1)

    if IS_PY3:
        result_str = result_str.decode()

    result = json.loads(result_str)
    if 'access_token' not in result or 'scope' not in result:
        print('please overwrite the correct API_KEY and SECRET_KEY')
        sys.exit(1)
    if 'brain_all_scope' not in result['scope'].split(' '):
        print('please ensure has check the ability')
        sys.exit(1)
    return result['access_token']
-
# OCR endpoint the script posts images to (general_basic); the
# accurate_basic variant is the alternative.
request_url = 'https://aip.baidubce.com/rest/2.0/ocr/v1/general_basic'
def read_file(image_path):
    """Return the raw content of the file at *image_path*.

    Args:
        image_path: Path of the file to read; it is opened in binary mode.

    Returns:
        The file content as bytes, or None when the file cannot be opened or
        read (a short message is printed in that case).
    """
    try:
        # The context manager closes the handle on every exit path.
        with open(image_path, 'rb') as f:
            return f.read()
    except (IOError, OSError):
        # Only I/O failures are treated as "unreadable"; KeyboardInterrupt
        # and SystemExit are no longer swallowed.
        print('read image file fail')
        return None
-
def request(url, data):
    """POST *data* to *url* and return the response body.

    Args:
        url: Target URL.
        data: Text request body; it is encoded as UTF-8 before sending.

    Returns:
        The response body (decoded to text on Python 3), or None when
        urlopen raises URLError; the error is printed in that case.
    """
    req = Request(url, data.encode('utf-8'))
    try:
        f = urlopen(req)
        try:
            result_str = f.read()
        finally:
            # Release the connection even if reading fails.
            f.close()
    except URLError as err:
        print(err)
        return None

    if IS_PY3:
        result_str = result_str.decode()
    return result_str
-
# Get the access token once; it is reused for every OCR request.
token = fetch_token()
# Folder whose entries are sent to the OCR service.
rootdir = u'F:/爬虫相关/天天充电/端口/'
listFile = os.listdir(rootdir)  # every file and directory inside rootdir

# Build the request URL a single time. Appending the token inside the loop
# made the query string grow on each iteration, so every request after the
# first one was malformed.
ocr_request_url = request_url + "?access_token=" + token
headers = {'content-type': 'application/x-www-form-urlencoded'}

for i, file_name in enumerate(listFile):
    print(i)
    try:
        path = os.path.join(rootdir, file_name)
        # Open the image in binary mode; the handle is closed even if the
        # read fails.
        with open(path, 'rb') as f:
            img = base64.b64encode(f.read())

        # Call the text-recognition service.
        params = {"image": img}
        response = requests.post(ocr_request_url, data=params, headers=headers)

        # Parse the result. A line containing '0' is split at the first '0':
        # the part before it is appended to the pending address, the rest is
        # taken as the port id and the pair is upserted. A line without '0'
        # becomes the pending address for the following line.
        result_json = response.json()
        addr = ''
        for words_result in result_json["words_result"]:
            text = words_result["words"]
            zeroIndex = text.find('0')
            if zeroIndex >= 0:
                addr += text[0:zeroIndex]
                portId = text[zeroIndex:]
                tiantianPort.get_collection().update(
                    {'portId': portId},
                    {'$set': {'portId': portId, 'addr': addr}},
                    upsert=True)
                addr = ''
            else:
                addr = text
    except Exception as err:
        # Best effort: one bad entry must not stop the batch, but the failure
        # is reported instead of being dropped silently. %r keeps the message
        # printable on Python 2 even for non-ASCII names.
        print('skip %r: %r' % (file_name, err))
        continue
    # NOTE(review): the original indentation was lost; this is assumed to run
    # once per successfully processed entry — confirm.
    print('OK')
|