urlencode and urldecode in Python

It is quite common for a URL or its query parameters to contain Chinese characters. But when such a URL is itself passed as a parameter (most commonly as a callback), the Chinese characters, and even the '/' characters, need to be percent-encoded first.

1. urlencode

The urllib library has a urlencode function that converts key-value pairs into the format we want and returns a string like a=1&b=2. For example:

>>> from urllib import urlencode
>>> data = {
...     'a': 'test',
...     'name': '魔兽'
... }
>>> print urlencode(data)
a=test&name=%C4%A7%CA%DE

What if you only want to urlencode a single string? urllib provides another function for that: quote()

>>> from urllib import quote
>>> quote('魔兽')
'%C4%A7%CA%DE'
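
As mentioned at the beginning, when a whole URL is passed as a callback parameter, even '/' has to be encoded. quote() leaves '/' unescaped by default, so one option (a minimal sketch; the path below is just an illustration) is to pass an empty string as the safe argument:

>>> from urllib import quote
>>> quote('/path/to/callback')        # default: '/' is treated as safe
'/path/to/callback'
>>> quote('/path/to/callback', '')    # empty safe set: '/' becomes %2F
'%2Fpath%2Fto%2Fcallback'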

2. urldecode

Once the urlencoded string has been passed over and received, it has to be decoded, i.e. urldecoded. urllib provides unquote() for this; there is no urldecode()!

>>> from urllib import unquote
>>> unquote('%C4%A7%CA%DE')
'\xc4\xa7\xca\xde'
>>> print unquote('%C4%A7%CA%DE')
魔兽

3. Discussion

When doing urldecode, the output of unquote() is the GBK encoding of the corresponding Chinese characters. Comparing it with the result of quote(), it is easy to see that so-called urlencode simply encodes the string as GBK bytes and then replaces '\x' with '%'. If your terminal uses UTF-8, you have to convert the result to UTF-8 before printing it, otherwise you get garbled output.
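
A minimal sketch (Python 2) of that last step, converting the GBK bytes returned by unquote() into UTF-8 before printing on a UTF-8 terminal:

>>> from urllib import unquote
>>> s = unquote('%C4%A7%CA%DE')            # GBK bytes: '\xc4\xa7\xca\xde'
>>> print s.decode('gbk').encode('utf-8')  # re-encode for a UTF-8 terminal
魔兽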

Depending on your actual needs, you can also define or wrap your own urlencode()/urldecode() style functions.
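
For instance, a rough sketch of such wrappers (Python 2; the names urlencode_gbk and urldecode_gbk are made up here for illustration) that make the encoding explicit:

# illustration only: hypothetical helpers that pin the URL encoding to GBK
from urllib import quote, unquote

def urlencode_gbk(s, encoding='utf-8'):
    # accept unicode or a byte string in `encoding`, percent-encode its GBK bytes
    if not isinstance(s, unicode):
        s = s.decode(encoding)
    return quote(s.encode('gbk'), '')

def urldecode_gbk(s, encoding='utf-8'):
    # reverse: percent-decode, then convert the GBK bytes to `encoding`
    return unquote(s).decode('gbk').encode(encoding)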

Python crawler practice

Bored over the May 1st holiday, I wrote a crawler for fun.

Python really is convenient for writing small things like this.

__author__='LeoKim'
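# Script 1: crawl the neighbourhood (xiaoqu) listing pages of nj.lianjia.com page by page
# and insert the neighbourhood names into the `village` table.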
from bs4 import BeautifulSoup
  
import re
import urllib.request, urllib.parse, http.cookiejar
import json
import time
import pymysql
 
conn=pymysql.connect(host='localhost',user='root',passwd='superhero',db='python_test',port=3306,charset='utf8')
cur=conn.cursor()  # get a cursor
 
# fetch the HTML of one listing page via the given URL
def getVillage(url):
    cj = http.cookiejar.CookieJar()
    opener = urllib.request.build_opener(urllib.request.HTTPCookieProcessor(cj))
    opener.addheaders = [('User-Agent','Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/41.0.2272.101 Safari/537.36'),
    ('Cookie', 'select_city=320100; lianjia_uuid=c73af582-9ed7-42ed-9738-bbd4688c67e0; UM_distinctid=15bb9f33ca387c-0ac15874ad5d0d-6a11157a-1fa400-15bb9f33ca4a02; _jzqckmp=1; all-lj=c28812af28ef34a41ba2474a2b5c52c2; _jzqx=1.1493473537.1493544561.2.jzqsr=nj%2Elianjia%2Ecom|jzqct=/ershoufang/gulou/.jzqsr=nj%2Elianjia%2Ecom|jzqct=/xiaoqu/pg1/; _gat=1; _gat_past=1; _gat_global=1; _gat_new_global=1; _gat_dianpu_agent=1; _smt_uid=59049861.8870a59; CNZZDATA1253492138=835595246-1493470448-null%7C1493541950; CNZZDATA1254525948=1922726511-1493470772-null%7C1493540995; CNZZDATA1255633284=630946367-1493469955-null%7C1493543402; CNZZDATA1255604082=270979082-1493468920-null%7C1493544528; _qzja=1.1520598967.1493473405458.1493480837509.1493544561423.1493544849473.1493544851953.0.0.0.29.3; _qzjb=1.1493544561423.10.0.0.0; _qzjc=1; _qzjto=10.1.0; _jzqa=1.2414222906473966000.1493473537.1493480838.1493544561.3; _jzqc=1; _jzqb=1.10.10.1493544561.1; _ga=GA1.2.1108117219.1493473408; _gid=GA1.2.2091828031.1493544853; lianjia_ssid=5c8ebd96-81f4-4430-bfda-6d941fcb8663')]
 
    urllib.request.install_opener(opener)
 
    html_bytes = urllib.request.urlopen(url).read()
    html_string = html_bytes.decode('utf-8')
    return html_string
 
 
 
def start(start_url):
    html_doc = getVillage(start_url)
    soup = BeautifulSoup(html_doc, 'html.parser')
 
    # read the total page count and the current page number
    totalPageNoDiv=soup.find("div","house-lst-page-box")
    Page = eval(totalPageNoDiv.attrs['page-data'])
 
    totalPageNo = Page['totalPage']
    curPage = Page['curPage']
 
    print('Crawling page '+str(curPage)+' of '+str(totalPageNo)+'.')
 
 
    # extract the neighbourhood entries
    divs = soup.find_all("div","title")
    for div in divs:
        a_tag = div.find("a",target="_blank")
        if(a_tag):
            # insert into the database
            sql = "INSERT INTO `village` (`name`) VALUES (%s)"
            cur.execute(sql, (a_tag.string,))
 
    curPage = curPage + 1
    if totalPageNo == curPage - 1:
        print('Done.')
    else:
        time.sleep(10)
        start_url = "http://nj.lianjia.com/xiaoqu/pg"+str(curPage)
        start(start_url)
 
 
totalPageNo=1
curPage=1
 
start_url = "http://nj.lianjia.com/xiaoqu/pg"+str(curPage)
start(start_url)
 
 
conn.commit()  # commit the INSERTs (pymysql does not autocommit by default)
cur.close()    # close the cursor
conn.close()   # release the database connection
__author__='LeoKim'
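# Script 2: for each neighbourhood name collected by script 1, look up its geohash,
# latitude and longitude via the ele.me POI API and write them back to the `village` table.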
from bs4 import BeautifulSoup
import urllib.request, urllib.parse, http.cookiejar
from urllib import parse
import json
import pymysql
 
conn=pymysql.connect(host='localhost',user='root',passwd='superhero',db='python_test',port=3306,charset='utf8')
cur=conn.cursor()  # get a cursor
 
def getgeohash(keyword):
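    # query the ele.me POI API for the keyword and return the first match, or 'error'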
    key={
        'keyword':keyword
    }
 
 
    url='https://mainsite-restapi.ele.me/v2/pois?extras%5B%5D=count&geohash=wtsm0ss7yfj8&limit=20&type=nearby&'+parse.urlencode(key)
 
    cj = http.cookiejar.CookieJar()
    opener = urllib.request.build_opener(urllib.request.HTTPCookieProcessor(cj))
    opener.addheaders = [('User-Agent','Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/41.0.2272.101 Safari/537.36'),
    ('Cookie', 'ubt_ssid=but8xnmtkpfrbvypd9z3hxaa5i8ugmj0_2017-04-29; _utrace=edd9bb6de13caed667d2cf273d73fc0a_2017-04-29')]
 
    urllib.request.install_opener(opener)
 
    html_bytes = urllib.request.urlopen(url).read()
    html_string = html_bytes.decode('utf-8')
    soup = BeautifulSoup(html_string, 'html.parser')
 
    try:
        # the response body is a JSON array of POIs; parse it and return the first one
        info = json.loads(soup.prettify())
        if info:
            return info[0]
        else:
            return 'error'
    except:
        return 'error'
     
 
sql = "SELECT id,name FROM `village` where geohash is null"
cur.execute(sql)
data = cur.fetchall()
 
for d in data:
    print(d[0])
 
    geohash=''
    latitude=''
    longitude=''
 
    gh=getgeohash(d[1])
 
    if gh=='error':
        geohash='error'
        latitude=''
        longitude=''
    else:
        geohash = gh['geohash']
        latitude = gh['latitude']
        longitude = gh['longitude']
 
    print(geohash,latitude,longitude)
 
# gh['geohash'] is None
 
    sql = "UPDATE `village` SET geohash=%s,latitude=%s,longitude=%s where id=%s"
    cur.execute(sql, (geohash,latitude,longitude,d[0]))
     
conn.commit()  # commit the UPDATEs (pymysql does not autocommit by default)
cur.close()    # close the cursor
conn.close()   # release the database connection
__author__='LeoKim'
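# Script 3: for each neighbourhood that has a geohash, query the ele.me restaurant API
# and insert the nearby shops into the `store` table.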
from bs4 import BeautifulSoup
import urllib.request, urllib.parse, http.cookiejar
from urllib import parse
import pymysql
import json
import re
 
conn=pymysql.connect(host='localhost',user='root',passwd='superhero',db='python_test',port=3306,charset='utf8')
cur=conn.cursor()  # get a cursor
 
def getstore(village_id,geohash,latitude,longitude,limit):
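    # fetch up to `limit` restaurants around the given location and insert each into `store`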
    key={
        'geohash':geohash,
        'latitude':latitude,
        'longitude':longitude,
        'limit':limit
    }
 
 
    url='https://mainsite-restapi.ele.me/shopping/restaurants?extras%5B%5D=activities&offset=0&terminal=web&'+parse.urlencode(key)
 
    cj = http.cookiejar.CookieJar()
    opener = urllib.request.build_opener(urllib.request.HTTPCookieProcessor(cj))
    opener.addheaders = [('User-Agent','Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/41.0.2272.101 Safari/537.36'),
    ('Cookie', 'ubt_ssid=but8xnmtkpfrbvypd9z3hxaa5i8ugmj0_2017-04-29; _utrace=edd9bb6de13caed667d2cf273d73fc0a_2017-04-29')]
 
    urllib.request.install_opener(opener)
 
    html_bytes = urllib.request.urlopen(url).read()
    html_string = html_bytes.decode('utf-8')
    soup = BeautifulSoup(html_string, 'html.parser')
 
    info = soup.prettify()
    jsonData = json.loads(info)
     
 
    for data in jsonData:
 
        print(data['id'])
        print(village_id)
        print(data['name'])
        print(data['recent_order_num'])
        print(data['address'])
        print(data['order_lead_time'])
        print(data['float_delivery_fee'])
 
        average_cost=0
        if 'average_cost' in data:
            cost = re.findall(r'\d+', data['average_cost'])
            average_cost=cost[0]
            print(average_cost)
 
        print(data['rating'])
        print('---------------------------------------------')
 
        shop_id = data['id']
        name = data['name']
        address = data['address']
        recent_order_num = data['recent_order_num']
        order_lead_time = data['order_lead_time']
        float_delivery_fee = data['float_delivery_fee']
        rating = data['rating']
 
        sql = "INSERT INTO `store` (`shop_id`,`village_id`,`name`,`address`,`recent_order_num`,`order_lead_time`,`float_delivery_fee`, `average_cost`, `rating`) VALUES (%s,%s,%s,%s,%s,%s,%s,%s,%s)"
        cur.execute(sql, (shop_id,village_id,name, address, recent_order_num, order_lead_time, float_delivery_fee, average_cost, rating))
 
 
# getstore('wtst84g4g0u','31.91988','118.83238',30)
 
sql = "SELECT id,name,geohash,latitude,longitude FROM `village` where id >482 and geohash is not null"
cur.execute(sql)
data = cur.fetchall()
 
for d in data:
    village_id=d[0]
    geohash = d[2]
    latitude = d[3]
    longitude = d[4]
 
    getstore(village_id,geohash,latitude,longitude,30)
 
 
conn.commit()  # commit the INSERTs (pymysql does not autocommit by default)
cur.close()    # close the cursor
conn.close()   # release the database connection