import aiohttp
from bs4 import BeautifulSoup
from xlrd import open_workbook
from xlwt import Workbook
# Target URLs (placeholders; the real list holds 20 000+ sites,
# some of which may be unreachable).
url_list = [
    "https://www.facebook.com",
    "https://www.baidu.com",
    "https://www.yahoo.com",
]
# Keywords searched for in each page's <title> (placeholders).
keywords = ['xxx', 'xxx']

start = time.time()
localtime = time.asctime(time.localtime(time.time()))
print("start time :", localtime)

choose_url = []  # URLs whose title matched one of the keywords
url_title = []   # the corresponding matched titles, index-aligned with choose_url
async def get(url, session):
    """Fetch *url*, parse its <title>, and record it if a keyword matches.

    Side effects: appends to the module-level choose_url / url_title lists.
    Failures (DNS errors, timeouts, pages without a <title>) are skipped
    silently, since many of the 20 000+ target sites are expected to be
    unreachable — this is a deliberate best-effort scrape.
    """
    try:
        # Bound the request so dead hosts fail fast; the original
        # timeout=0 disabled the timeout entirely and let slow hosts
        # hang the whole run.
        timeout = aiohttp.ClientTimeout(total=30)
        async with session.get(url=url, timeout=timeout) as response:
            resp = await response.text()
        soup = BeautifulSoup(resp, "lxml")
        title_tag = soup.find("title")
        if title_tag is None:
            # Page has no <title>; nothing to match against.
            return
        title = title_tag.text.strip()
        for keyword in keywords:
            if keyword in title:
                choose_url.append(url)
                url_title.append(title)
                print("Successfully got url {} with resp's name {}.".format(url, title))
                break
    except Exception:
        # Best-effort: unreachable/broken sites are expected; skip them.
        pass
async def main(urls):
    """Fan out one concurrent GET per URL and wait for all of them."""
    # limit=0 / limit_per_host=0 removes the connection-pool caps;
    # ssl=False skips certificate verification (many target hosts have
    # broken certs and would otherwise fail the handshake).
    connector = aiohttp.TCPConnector(ssl=False, limit=0, limit_per_host=0)
    # async with guarantees the session is closed even if gather() raises;
    # the original leaked the session on any error.
    async with aiohttp.ClientSession(connector=connector) as session:
        await asyncio.gather(*(get(url, session) for url in urls))
        print("Finalized all. Return is a list of outputs.")
def write_exccel(choose_url, url_title):
    """Write the matched (url, title) pairs to results.xls, one pair per row.

    Uses the xlwt Workbook imported at the top of the file. The misspelled
    function name is kept for backward compatibility with existing callers.
    """
    book = Workbook(encoding='utf-8')
    sheet = book.add_sheet('result')
    # Header row, then one row per matched URL.
    sheet.write(0, 0, 'url')
    sheet.write(0, 1, 'title')
    for row, (url, title) in enumerate(zip(choose_url, url_title), start=1):
        sheet.write(row, 0, url)
        sheet.write(row, 1, title)
    book.save('results.xls')
# Run the async crawl over every URL, then persist the matches.
asyncio.run(main(url_list))
write_exccel(choose_url, url_title)

# Report wall-clock finish time and total duration.
localtime = time.asctime(time.localtime(time.time()))
end = time.time()
print("now time is :", localtime)
print('time used:', end - start)
我有 20000 个要请求的 URL。但是时间比较长(4、5个小时以上),如果我用requests+multiprocessing(Pool 4)的话3个小时就可以了。
我试过用aiohttp+multiprocessing,好像不行。通过优化此代码或使用任何可用技术,代码能否尽可能快?谢谢
最佳答案
不知道下面的方法快不快
import time
from simplified_scrapy import Spider, SimplifiedDoc, SimplifiedMain, utils
class MySpider(Spider):
    """Crawl start_urls and record every page whose <title> contains a keyword."""
    name = 'demo_spider'
    # Entry pages (placeholders; the real job covers 20 000+ URLs).
    start_urls = ["https://www.facebook.com", "https://www.baidu.com", "https://www.yahoo.com"]
    keywords = ['xxx', 'xxx']
    choose_url = []  # matched URLs
    url_title = []   # matched titles, index-aligned with choose_url
    concurrencyPer1s = 10

    def extract(self, url, html, models, modelNames):
        # Framework callback invoked for each downloaded page.
        doc = SimplifiedDoc(html)
        title = doc.title
        if title.containsOr(self.keywords):
            self.choose_url.append(url.url)
            self.url_title.append(title.text)
            print("Successfully got url {} with resp's name {}.".format(url, title.text))

    def urlCount(self):
        # Stop the crawler once the URL queue is empty; otherwise the
        # framework would keep its worker threads alive.
        count = Spider.urlCount(self)
        if count == 0:
            SimplifiedMain.setRunFlag(False)
        return count
# Record the start time before launching the crawl.
start = time.time()
localtime = time.asctime(time.localtime(time.time()))
print("start time :", localtime)

# Start download: 600 concurrent downloads, capped at 100 new requests
# per second, spread over 10 worker threads.
SimplifiedMain.startThread(MySpider(), {"concurrency": 600, "concurrencyPer1S": 100, "intervalTime": 0.001, "max_workers": 10})

# Report finish time and total duration.
localtime = time.asctime(time.localtime(time.time()))
print("now time is :", localtime)
end = time.time()
print('time used:', end - start)
https://stackoverflow.com/questions/62688012/
相关文章:
java - Bootstrap 类路径未与 -source 8 一起设置
jsf - Primefaces Dynaform 奇怪的行为与 f :validateLength
oracle - 如何使用 UTF8 作为国家字符集在 docker 中创建 Oracle 数据库?
python - 比较两个列表,如果相等则替换第三个列表中的值
typescript - TypeScript 中任意数量类型的非析取联合
azure - 如何使用 terraform 创建 azure 事件网格系统主题?
javascript - 在 Chrome 中将下/上填充应用于具有最小值和最大值的 "input"