curl -d "a=3&b=99&c=100" http://httpbin.org/post 字符串形式传参数
curl -d @./post.data http://httpbin.org/post 可以指定文件名字参数,文件中含有要传递的参数
curl -o fox.jpeg http://httpbin.org/image/jpeg
The uppercase -O option saves the file under its remote file name.
curl -o image.webp -H "accept:image/webp" http://httpbin.org/image    WebP is Google's image format; it reduces file size without noticeably reducing image quality
SSL request to 12306: certificate subject name (webssl.chinanetcenter.com) does not match target host name '12306.cn'
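One common workaround, at the cost of security, is to skip certificate verification with -k / --insecure; a cleaner fix is to point curl at the right CA bundle with --cacert (the certificate path below is only an illustrative placeholder):
curl -k https://www.12306.cn
curl --cacert ./ca.cer https://www.12306.cn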
Pager keys: g / k to move forward / backward, Ctrl+B / Ctrl+F to page up / down, q to quit
Extract your own IP address:
curl http://httpbin.org/get|grep -E "origin" | cut -d '"' -f4
In Python, the re module's search / match / findall can do the same extraction (see the sketch after the alias below).
alias myip="curl -s http://httpbin.org/get|grep -E 'origin' | cut -d '\"' -f4"
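A minimal sketch of how search, match and findall differ; the JSON text here is a made-up stand-in for the httpbin.org/get response body:
import re

text = '{"origin": "1.2.3.4"}'                           # stand-in for the httpbin.org/get body
print(re.match(r'"origin"', text))                       # None: match only matches at the start of the string
print(re.search(r'"origin": "(.+?)"', text).group(1))    # 1.2.3.4: search scans the whole string
print(re.findall(r'\d+', text))                          # ['1', '2', '3', '4']: findall returns every match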
wget
wget -O test.png http://httpbin.org/image/png
httpie
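A couple of httpie examples, assuming httpie is installed and available as the http command (param==value sends a query-string parameter, key=value is sent as a JSON body by default):
http GET http://httpbin.org/get a==1
http POST http://httpbin.org/post name=zyl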
Start a simple file-serving server with Python
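The standard-library http.server module is the usual way to do this; run it in the directory you want to serve (port 8000 is arbitrary):
python3 -m http.server 8000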
PostMan
Packet-capture tools: Charles (nicknamed 青花瓷) and Fiddler
Python 3 standard library
(1) urllib.request ----------------------->
import urllib.request
import urllib.parse
import json
url = 'http://httpbin.org/get'
resp = urllib.request.urlopen(url)
print(resp)
text = resp.read()
print(type(text.decode('utf-8')))
print(type(json.loads(text.decode('utf-8'))))
<http.client.HTTPResponse object at 0x7fde52fa3780>
<class 'str'>
<class 'dict'>
json.loads converts the JSON string into a dict
Adding custom request headers -------------------->
ua = "Mozilla/5.0 (Windows NT 10.0; Win64; x64) " \
"AppleWebKit/537.36 (KHTML, like Gecko)" \
" Chrome/83.0.4103.97 Safari/537.36"
# create the request
req = urllib.request.Request('http://httpbin.org/user-agent')
req.add_header('User-Agent', ua)
r = urllib.request.urlopen(req)
text = r.read()
resp = json.loads(text.decode('utf-8'))
print(resp['user-agent'])
(2) urllib.parse
GET parameters / POST parameters
import urllib.parse
import urllib.request
params = urllib.parse.urlencode({
    'spam': 1,
    'egg': 2
})
url = 'http://httpbin.org/get?%s' % params
with urllib.request.urlopen(url) as f:
    print(f.read().decode('utf-8'))
data = urllib.parse.urlencode({
    'name': 'zyl',
    'pass': '199802'
})
data = data.encode()
with urllib.request.urlopen('http://httpbin.org/post', data) as f:
    print(f.read().decode('utf-8'))
(3) Using a proxy
proxy_handle = urllib.request.ProxyHandler(
    {
        'http': '183.166.71.109:9999'
    }
)
# proxy with a username and password
# proxy_auth_handler = urllib.request.ProxyBasicAuthHandler()
opener = urllib.request.build_opener(proxy_handle)
r = opener.open('http://httpbin.org/ip')
print(r.read().decode('utf-8'))
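For the commented-out authenticated-proxy case, a minimal sketch might look like this (the proxy address, user and password are placeholders, not working credentials):
password_mgr = urllib.request.HTTPPasswordMgrWithDefaultRealm()
# realm None means these credentials apply to whatever realm the proxy reports
password_mgr.add_password(None, 'http://183.166.71.109:9999', 'user', 'password')
proxy_auth_handler = urllib.request.ProxyBasicAuthHandler(password_mgr)
opener = urllib.request.build_opener(proxy_handle, proxy_auth_handler)
print(opener.open('http://httpbin.org/ip').read().decode('utf-8'))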
(4) urllib.parse: urlparse
from urllib import parse
o = parse.urlparse("http://zyl:199802@httpbin.org/get?a=2&c=88#test")
print(type(o))
print(dir(o))
print(o.netloc)
print(o.scheme)
print(o.query)
print(o.fragment)
The requests library
import requests
r = requests.get('http://httpbin.org/get')
print(r.status_code,r.reason)
print(r.text)
r = requests.post("http://httpbin.org/post",data={'a':99999})
print(r.status_code, r.reason)
print(r.text)
print(r.json())
# GET request with parameters
r = requests.get("http://httpbin.org/get",params={'a':99999})
print(r.status_code, r.reason)
print(r.text)
print(r.json())
#user-agent
ua = "Mozilla/5.0 (Windows NT 10.0; Win64; x64) " \
"AppleWebKit/537.36 (KHTML, like Gecko)" \
" Chrome/83.0.4103.97 Safari/537.36"
headers = {'User-Agent':ua}
r = requests.get('http://httpbin.org/headers',headers=headers)
print(r.status_code, r.reason)
print(r.text)
#cookie
print('-------------------------')
cookies = dict(userid = '12345', token='xxxxxxxxxxxxxxxxyyyyyy')
r = requests.get('http://httpbin.org/cookies',cookies = cookies)
print(r.json())
#authentication
print('++++++++++++++++++ authentication')
r = requests.get('http://httpbin.org/basic-auth/zyl/123456',auth=('zyl','123456'))
print(r.json())
A 400 error is a client error.
Raising exceptions
import requests
print('raising an exception')
bad_r = requests.get('http://httpbin.org/status/400')
bad_r.raise_for_status()
sessions
import requests
s = requests.Session()
# the session keeps cookies across requests
s.get('http://httpbin.org/cookies/set/userid/123456')
r = s.get('http://httpbin.org/cookies')
print(r.json())
Proxy
r = requests.get('http://httpbin.org/ip')
print(r.json())
print('using a proxy')
r = requests.get('http://httpbin.org/ip',proxies={'http':'113.120.32.98:9999'})
print(r.json())
Timeout
r = requests.get('http://httpbin.org/delay/4',timeout=5)
print(r.text)
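If the server takes longer than the timeout, requests raises an exception instead of returning; a small sketch of catching it:
try:
    r = requests.get('http://httpbin.org/delay/4', timeout=2)
except requests.exceptions.Timeout:
    # /delay/4 waits 4 seconds, so a 2-second timeout triggers this branch
    print('request timed out')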
BeautifulSoup, official documentation: https://www.crummy.com/software/BeautifulSoup/bs4/doc/
import bs4
from bs4 import BeautifulSoup
html_doc = """
<html><head><title>The Dormouse's story</title></head>
<body>
<p class="title"><b>The Dormouse's story</b></p>
<p class="story">Once upon a time there were three little sisters; and their names were
<a href="http://example.com/elsie" class="sister" id="link1">Elsie</a>,
<a href="http://example.com/lacie" class="sister" id="link2">Lacie</a> and
<a href="http://example.com/tillie" class="sister" id="link3">Tillie</a>;
and they lived at the bottom of a well.</p>
<p class="story">...</p>
"""
soup = BeautifulSoup(html_doc, 'html.parser')
# print(soup.prettify())
print(soup.title)
print(soup.a.attrs)
# extract all of the link addresses
for a in soup.find_all('a'):
    print(a.attrs['href'])
# find a specific node
print(soup.find(id='link3'))
# get the text content
print(soup.get_text())
print(soup.select('.story'))
# . selects by class attribute, # selects by id attribute
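For completeness, the same document queried by id and by a nested CSS selector (both work with soup.select):
print(soup.select('#link1'))       # select by id
print(soup.select('p.story a'))    # <a> tags inside <p class="story">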
lxml is implemented in C underneath, so it is fast.
XPath is a language for locating information in XML documents.
Learning site: http://www.scrapyd.cn/doc/
from lxml import etree
html_doc = """
<html><head><title>The Dormouse's story</title></head>
<body>
<p class="title"><b>The Dormouse's story</b></p>
<p class="story">Once upon a time there were three little sisters; and their names were
<a href="http://example.com/elsie" class="sister" id="link1">Elsie</a>,
<a href="http://example.com/lacie" class="sister" id="link2">Lacie</a> and
<a href="http://example.com/tillie" class="sister" id="link3">Tillie</a>;
and they lived at the bottom of a well.</p>
<p class="story">sssss</p>
"""
selector = etree.HTML(html_doc)
links = selector.xpath('//p[@class="story"]/a/@href')
print(links)
r = requests.get('http://iguye.com/books.xml')
se = etree.HTML(r.text)
print(se.xpath('book'))
print(se.xpath('//book'))
# select the names of the authors of all books in the bookstore
print(se.xpath('//bookstore/book/author/text()'))
# select the language of every book in the bookstore
print(se.xpath('//bookstore/book/title/@lang'))
# select the title of the first book in the bookstore
print(se.xpath('//bookstore/book[1]/title/text()'))
# select the title of the last book in the bookstore
print(se.xpath('//bookstore/book[last()]/title/text()'))
# select the title of the second-to-last book in the bookstore
print(se.xpath('//bookstore/book[last()-1]/title/text()'))
# select the titles of the first 2 books in the bookstore
print('first 2 books',
      se.xpath('//bookstore/book[position()<3]/title/text()'))
# select all books in the "web" category
print('books in the web category', se.xpath('//book[@category="web"]/title/text()'))
# select all books priced above 35.00
print('books priced above 35.00', se.xpath('//book[price>35.00]/price/text()'))
# select the class attribute of all books whose class attribute contains "book"
print('books whose class contains book', se.xpath('//book[contains(@class, "book")]/@class'))
import os
from urllib.parse import urlparse
from bs4 import BeautifulSoup
import requests
ua = "Mozilla/5.0 (Windows NT 10.0; Win64; x64) " \
"AppleWebKit/537.36 (KHTML, like Gecko)" \
" Chrome/83.0.4103.97 Safari/537.36"
headers = {'User-Agent':ua}
r = requests.get('http://www.xiachufang.com/',headers=headers)
soup = BeautifulSoup(r.text,'html.parser')
img_list = []
for img in soup.select('img'):
    if img.has_attr('data-src'):
        img_list.append(img.attrs['data-src'])
    else:
        img_list.append(img.attrs['src'])
# initialise the download directory
image_dir = os.path.join(os.getcwd(), 'images')
if not os.path.exists(image_dir):
    os.mkdir(image_dir)
print(len(img_list))
for img in img_list[2:-1]:
    o = urlparse(img)
    print(o)
    filename = o.path[1:].split('@')[0]
    filepath = os.path.join(image_dir, filename)
    if o.scheme != '':
        url = '%s://%s/%s' % (o.scheme, o.netloc, filename)
        print(url)
        resp = requests.get(url)
        with open(filepath, 'wb') as f:
            for chunk in resp.iter_content(1024):
                f.write(chunk)
import re
import os
from io import BytesIO
from urllib.parse import urlparse
from pycurl import Curl
buffer = BytesIO()
c = Curl()
c.setopt(c.URL, 'http://www.xiachufang.com/')
c.setopt(c.WRITEDATA, buffer)
c.perform()
c.close()
body = buffer.getvalue()
text = body.decode('utf-8')
print(text)
img_list = re.findall(r'src=\"(http://i2\.chuimg\.com/\w+\.jpg)',
text)
# initialise the download directory
image_dir = os.path.join(os.curdir, 'images')
# if not os.path.isdir(image_dir):
# os.mkdir(image_dir)
for img in img_list[::-1]:
    o = urlparse(img)
    filename = o.path[1:]
    filepath = os.path.join(image_dir, filename)
    if not os.path.isdir(os.path.dirname(filepath)):
        os.mkdir(os.path.dirname(filepath))
    url = '%s://%s/%s' % (o.scheme, o.netloc, filename)
    print(url)
    with open(filepath, 'wb') as f:
        c = Curl()
        c.setopt(c.URL, url)
        c.setopt(c.WRITEDATA, f)
        c.perform()
        c.close()
curl -s http://www.xiachufang.com | grep -oP '(?<=src=\")http://i2\.chuimg\.com/\w+\.jpg' | xargs -i curl --create-dirs {} -o ./images/{}
Multithreaded crawler
import time
import threading
from queue import Queue
import requests
from lxml import etree
start_url = 'http://qianmu.iguye.com/2018USNEWS%E4%B8%96%E7%95%8C%E5%A4%A7%E5%AD%A6%E6%8E%92%E5%90%8D'
link_queue = Queue()
threads_num = 50
threads = []
download_pages = 0
def fetch(url):
    """Fetch a page and return its text."""
    r = requests.get(url)
    if r.status_code != 200:
        r.raise_for_status()
    global download_pages
    download_pages += 1
    return r.text.replace('\t', '')


def parse_university(url):
    """Parse a university detail page."""
    selector = etree.HTML(fetch(url))
    data = {}
    data['name'] = selector.xpath('//div[@id="wikiContent"]/h1/text()')[0]
    table = selector.xpath(
        '//div[@id="wikiContent"]/div[@class="infobox"]/table')
    if table:
        table = table[0]
        keys = table.xpath('.//td[1]/p/text()')
        cols = table.xpath('.//td[2]')
        values = [' '.join(col.xpath('.//text()')) for col in cols]
        if len(keys) != len(values):
            return None
        data.update(zip(keys, values))
    return data


def process_data(data):
    """Process the extracted data."""
    if data:
        print(data)


def download():
    while True:
        # block until a link can be taken from the queue
        link = link_queue.get()
        if link is None:
            break
        data = parse_university(link)
        process_data(data)
        link_queue.task_done()
        print('remaining queue: %s' % link_queue.qsize())


if __name__ == '__main__':
    start_time = time.time()
    # 1. request the entry page
    selector = etree.HTML(fetch(start_url))
    # 2. extract the links from the list page
    links = selector.xpath('//div[@id="content"]//tr[position()>1]/td[2]/a/@href')
    for link in links:
        if not link.startswith('http://qianmu.iguye.com'):
            link = 'http://qianmu.iguye.com/%s' % link
        link_queue.put(link)
    # start the threads and keep the thread objects in a list
    for i in range(threads_num):
        t = threading.Thread(target=download)
        t.start()
        threads.append(t)
    # block until the queue has been drained
    link_queue.join()
    # put N None values on the queue to tell the threads to exit
    for i in range(threads_num):
        link_queue.put(None)
    # wait for the threads to finish
    for t in threads:
        t.join()
    cost_seconds = time.time() - start_time
    print('downloaded %s pages, cost %.2f seconds' %
          (download_pages, cost_seconds))
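Note that download_pages is incremented from many worker threads at once; a minimal, hedged way to make the counter thread-safe is to guard the update with a threading.Lock, for example inside fetch():
counter_lock = threading.Lock()

def fetch(url):
    """Fetch a page and return its text, counting pages under a lock."""
    r = requests.get(url)
    if r.status_code != 200:
        r.raise_for_status()
    global download_pages
    with counter_lock:        # serialise updates to the shared counter
        download_pages += 1
    return r.text.replace('\t', '')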