# -*- coding: utf-8 -*-
# !/usr/bin/env python
"""Run Baidu OCR over station-list screenshots and copy the matching ones.

Every file in ``rootdir`` is base64-encoded and posted to the Baidu
``general_basic`` OCR endpoint.  When any recognised text fragment contains
one of ``COPY_MARKERS`` the screenshot is copied into the target folder.

The script does all of its work at import/execution time (token fetch,
directory listing, OCR requests, file copies).
"""
import os
import sys
import time
import datetime
import urllib
import requests
from mongoengine import register_connection, PointField, DynamicDocument, StringField
import simplejson as json
import base64
import json  # rebinds `json` to the stdlib module, matching the original import order
import shutil
from django.db.models.fields import DateTimeField

IS_PY3 = sys.version_info.major == 3
if IS_PY3:
    from urllib.request import urlopen
    from urllib.request import Request
    from urllib.error import URLError
    from urllib.parse import urlencode
    from urllib.parse import quote_plus
else:
    import urllib2
    from urllib import quote_plus
    from urllib2 import urlopen
    from urllib2 import Request
    from urllib2 import URLError
    from urllib import urlencode

PROJECT_ROOT = os.path.join(os.path.abspath(os.path.split(os.path.realpath(__file__))[0] + "/.."), '..')
sys.path.insert(0, PROJECT_ROOT)
os.environ.setdefault("DJANGO_SETTINGS_MODULE", "configs.testing")

from script.base import init_env

init_env(interactive=False)

from apps.web.core.db import Searchable

# Work around HTTPS certificate verification failures.
import ssl

# NOTE(review): database credentials are hard-coded in source; consider moving
# them to configuration or environment variables.
register_connection(alias='spider', name='spider', host='211.159.224.10', port=27017,
                    username='dba', password='dayuan@2020..', authentication_source='admin')


class xingxingStation(Searchable):
    """Document stored in the ``xingxing_station`` collection of the ``spider`` alias."""

    name = StringField(default='')
    servicePhone = StringField(default='')
    company = StringField(default='')

    meta = {
        'collection': 'xingxing_station',
        'db_alias': 'spider',
        'unique_together': {'name'}
    }


# NOTE(review): this disables TLS certificate verification for every default
# HTTPS context created through the ssl module in this process.
ssl._create_default_https_context = ssl._create_unverified_context

# Use the Baidu API to parse the address and port number directly from the
# screenshots.
# NOTE(review): API credentials are hard-coded in source; consider loading
# them from configuration instead.
API_KEY = 'OVcN78LP40CBEwWk5REF2Hyu'
SECRET_KEY = 'a7luZBdbzjsfU9oE2GD3yPeTBgPty03t'

OCR_URL = "https://aip.baidubce.com/rest/2.0/ocr/v1/accurate_basic"
TOKEN_URL = 'https://aip.baidubce.com/oauth/2.0/token'

# Text fragments that mark a screenshot as one that must be copied.
COPY_MARKERS = (u'他营', u'联营')


def fetch_token():
    """Request an OAuth access token from ``TOKEN_URL``.

    Returns:
        The ``access_token`` value from the JSON response.

    Exits the process (status 1) when the request fails, when the response
    lacks ``access_token``/``scope``, or when the granted scope does not
    include ``brain_all_scope``.
    """
    params = {'grant_type': 'client_credentials',
              'client_id': API_KEY,
              'client_secret': SECRET_KEY}
    post_data = urlencode(params)
    if IS_PY3:
        post_data = post_data.encode('utf-8')
    req = Request(TOKEN_URL, post_data)
    try:
        f = urlopen(req, timeout=5)
        try:
            result_str = f.read()
        finally:
            f.close()
    except URLError as err:
        # Without a response there is nothing to parse; previously execution
        # continued and failed on an unbound `result_str`.
        print(err)
        sys.exit(1)
    if IS_PY3:
        result_str = result_str.decode()

    result = json.loads(result_str)
    if 'access_token' in result and 'scope' in result:
        if 'brain_all_scope' not in result['scope'].split(' '):
            print('please ensure has check the ability')
            sys.exit(1)
        return result['access_token']
    print('please overwrite the correct API_KEY and SECRET_KEY')
    sys.exit(1)


request_url = "https://aip.baidubce.com/rest/2.0/ocr/v1/general_basic"  # accurate_basic


def read_file(image_path):
    """Return the raw bytes of ``image_path``, or ``None`` if it cannot be read.

    A failure message is printed when the file cannot be opened or read.
    """
    try:
        # Open the image file in binary mode.
        with open(image_path, 'rb') as f:
            return f.read()
    except (IOError, OSError):
        print('read image file fail')
        return None


def request(url, data):
    """POST ``data`` (a text string, UTF-8 encoded here) to ``url``.

    Returns:
        The response body (decoded to text on Python 3), or ``None`` when a
        ``URLError`` occurs; the error is printed in that case.
    """
    req = Request(url, data.encode('utf-8'))
    try:
        f = urlopen(req)
        try:
            result_str = f.read()
        finally:
            f.close()
    except URLError as err:
        print(err)
        return None
    if IS_PY3:
        result_str = result_str.decode()
    return result_str


def _needs_copy(words_results):
    """Return True when any OCR entry's ``words`` text contains a copy marker."""
    return any(marker in entry["words"]
               for entry in words_results
               for marker in COPY_MARKERS)


# Fetch the access token.
token = fetch_token()

# Build the general text-recognition URL.
rootdir = u'Q:/友商信息/汽车桩/星星充电/站列表截图'
listFile = os.listdir(rootdir)  # List all directories and files under the folder.
resultList = []

# Loop-invariant request settings.
request_url1 = request_url + "?access_token=" + token
headers = {'content-type': 'application/x-www-form-urlencoded'}

for i, file_name in enumerate(listFile):
    print(i)
    path = os.path.join(rootdir, file_name)
    if not os.path.isfile(path):
        # os.listdir also returns sub-directories; they cannot be OCR'd.
        continue
    raw = read_file(path)
    if raw is None:
        continue
    try:
        # Call the OCR service.
        params = {"image": base64.b64encode(raw)}
        response = requests.post(request_url1, data=params, headers=headers)

        # Parse the returned result.
        result_json = response.json()
        words_results = result_json.get("words_result")
        if words_results is None:
            # %r keeps the output ASCII-safe on Python 2 consoles.
            print('no words_result for item %s: %r' % (i, result_json))
            continue

        if _needs_copy(words_results):
            print('get one %s' % i)
            # Unicode literal: formatting a byte-string path with a unicode
            # file name would rely on implicit ASCII decoding on Python 2.
            shutil.copyfile(path, u'Q:/友商信息/汽车桩/星星充电/他营/%s' % file_name)
    except Exception as e:
        # Best-effort batch: report the failure and move on to the next file.
        print('item %s failed: %r' % (i, e))
        continue

# NOTE(review): nothing above appends to resultList, so this prints nothing.
resultList = list(set(resultList))
for result in resultList:
    print(result)  # Print the text.

print('OK')