# -*- coding: utf-8 -*- # !/usr/bin/env python import os, sys import urllib import requests from mongoengine import register_connection, PointField, DynamicDocument, StringField import simplejson as json PROJECT_ROOT = os.path.join(os.path.abspath(os.path.split(os.path.realpath(__file__))[0] + "/.."), '..') sys.path.insert(0, PROJECT_ROOT) os.environ.setdefault("DJANGO_SETTINGS_MODULE", "configs.testing") from script.base import init_env init_env(interactive = False) CITY_MAP = { 'hf': u'合肥' } register_connection(alias = 'spider', name = 'spider', host = '127.0.0.1', port = 27017) class Village(DynamicDocument): city = StringField(default = "") area = StringField(default = "") name = StringField(default = '') type = StringField(default = "") address = StringField(default = "") location = PointField(default = None) alias = StringField(default = '') meta = { 'collection': 'village', 'db_alias': 'spider', 'unique_together': {'city', 'area', 'name'} } def get_location(name): url_template = 'https://apis.map.qq.com/jsapi?qt=geoc&addr={}&key=FBOBZ-VODWU-C7SVF-B2BDI-UK3JE-YBFUS&output=jsonp&pf=jsapi&ref=jsapi' url = url_template.format(urllib.quote(name)) print url strhtml = requests.get(url, timeout = 15).text result = json.loads(strhtml) print result longitude = float(result['detail']['pointx']) latitude = float(result['detail']['pointy']) return { 'type': 'Point', 'coordinates': [longitude, latitude] } def get_location2(name): url_template = 'https://apis.map.qq.com/jsapi?qt=poi&wd={}&pn=0&rn=10&rich_source=qipao&rich=web&nj=0&c=1&key=FBOBZ-VODWU-C7SVF-B2BDI-UK3JE-YBFUS&pf=jsapi&ref=jsapi' url = url_template.format(name) strhtml = requests.get(url, timeout = 15).text result = json.loads(strhtml) try: poi = result['detail']['pois'][0] longitude = float(poi['pointx']) latitude = float(poi['pointy']) return { 'type': 'Point', 'coordinates': [longitude, latitude] } except Exception as e: print e.message print result print url.encode('utf-8') def spider_one_city(city, local_name): curr_page = 1 total_page = 9999 while curr_page < total_page: print 'curr = {}, total = {}'.format(curr_page, total_page) url = 'https://m.58.com/xiaoquweb/getXiaoquList/?city={city}&key=&page={page}&price=&sort=&completiontime=&latlon=&stationid='.format( city = city, page = curr_page) strhtml = requests.get(url, timeout = 15).text result = json.loads(strhtml) dto_page = result['data']['pageDTO'] if int(dto_page['totalPage']) > total_page or total_page == 9999: total_page = int(dto_page['totalPage']) info_list = result['data']['infoList'] for item in info_list: address = item['address'] alias = item['alias'] area = item['areaName'] village_type = item['infoParamEntity']['map']['propertytype'] name = item['name'] try: village = Village(city = local_name, area = area, name = name, type = village_type, address = address, alias = alias) village.save() except Exception as e: print e.message curr_page = curr_page + 1 try: spider_one_city('su', u'苏州') except Exception as e: print e.message items = Village.objects.filter(city = u'苏州', location = None) for item in items: try: find_name = u'{} {} {}'.format(item.city, item.area, item.name) item.location = get_location2(find_name.encode('utf8')) item.save() except Exception as e: print e.message