我用 Python 结合 Selenium 编写了一个脚本,并在其中使用了代理,用于登录 Facebook 并抓取我的动态(feed)顶部那条帖子的发布者名称。我希望脚本每五分钟执行一次,并且无限期地运行下去。
由于这种连续登录可能会导致我的帐户被禁止,我想在脚本中实现代理以匿名完成所有操作。
到目前为止我已经写了:
import random
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
def get_first_user(random_proxy):
    """Log in to Facebook through a proxy and return the first poster name found.

    Args:
        random_proxy: Proxy address handed to Chrome's ``--proxy-server`` switch.

    Returns:
        The text of the first element matching the poster-name XPath.

    Raises:
        selenium.common.exceptions.TimeoutException: If the login form or the
            poster link does not appear within 10 seconds.
    """
    options = webdriver.ChromeOptions()
    # A value of 2 blocks the browser notification permission prompt.
    prefs = {"profile.default_content_setting_values.notifications": 2}
    options.add_experimental_option("prefs", prefs)
    options.add_argument(f'--proxy-server={random_proxy}')

    # The context manager quits the browser even if a step below raises.
    with webdriver.Chrome(options=options) as driver:
        wait = WebDriverWait(driver, 10)
        driver.get("https://www.facebook.com/")

        # find_element(By.ID, ...) replaces the find_element_by_id helper,
        # which was removed in Selenium 4.3. Waiting for the e-mail field
        # avoids typing before the login form has rendered.
        wait.until(EC.presence_of_element_located((By.ID, "email"))).send_keys("username")
        driver.find_element(By.ID, "pass").send_keys("password", Keys.RETURN)

        user = wait.until(
            EC.presence_of_element_located(
                (By.XPATH, "//h4[@id][@class][./span[./a]]/span/a")
            )
        ).text
    return user
if __name__ == '__main__':
    # Local import: this snippet's import block never brings `time` into scope.
    import time

    # Placeholder entry: replace with your own list of proxies.
    proxies = ["host:port"]

    while True:
        # random.choice leaves the pool intact so the loop can run indefinitely;
        # popping entries would eventually empty the list and make
        # random.randrange(0) raise ValueError.
        random_proxy = random.choice(proxies)
        print(get_first_user(random_proxy))
        # time.sleep takes seconds, not milliseconds: 5 minutes is 300 seconds.
        time.sleep(5 * 60)
How to stay undetected while scraping data continuously from a site that requires authentication?
最佳答案
我不确定您为什么要每 5 分钟重新登录一次 Facebook
帐户来抓取内容。而且每次登录都使用随机的代理地址,很可能会触发 Facebook
的安全规则,引起警觉(red flag)。
与其每 5 分钟登录一次 Facebook,我建议保持登录状态。Selenium
具有刷新由自动化控制的网页的功能。通过使用此方法,您可以按预定义的时间间隔(例如
5 分钟)刷新您的 Facebook 动态(feed)。
下面的代码使用这个刷新方法来重新加载页面。该代码还会检查您 Feed 顶部的用户帖子。
在测试中,我注意到 Facebook
使用了一些随机标记,这可能用于缓解抓取。我还注意到 Facebook
更改了链接到群组的帖子的用户名格式,因此如果您想要链接到这些帖子的用户名,则需要进行更多测试。我强烈建议进行更多测试以确定哪些用户元素没有被正确抓取。
from random import randint
from time import sleep

from selenium import webdriver
from selenium.common.exceptions import NoSuchElementException
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.common.by import By
# Configure the Chrome session that the script drives.
chrome_options = Options()
chrome_options.add_argument("--start-maximized")
chrome_options.add_argument("--disable-infobars")
chrome_options.add_argument("--disable-extensions")
chrome_options.add_argument("--disable-popup-blocking")

# disable the banner "Chrome is being controlled by automated test software"
chrome_options.add_experimental_option("useAutomationExtension", False)
chrome_options.add_experimental_option("excludeSwitches", ['enable-automation'])

# Module-level driver: one browser session is created once and kept open.
# NOTE(review): newer Selenium 4 releases appear to no longer accept the
# chromedriver path as a positional argument - confirm against the installed
# version.
driver = webdriver.Chrome('/usr/local/bin/chromedriver', options=chrome_options)

driver.get('https://www.facebook.com')

# Log in once. find_element(By.<strategy>, ...) replaces the
# find_element_by_* helpers, which were removed in Selenium 4.3.
driver.implicitly_wait(20)
driver.find_element(By.ID, "email").send_keys("your_username")
driver.find_element(By.ID, "pass").send_keys("your_password")

# From here on, element lookups wait up to 10 seconds before giving up.
driver.implicitly_wait(10)
driver.find_element(By.XPATH, "//button[text()='Log In']").click()
def user_element_exist():
    """Report whether the standard username link is present on the current page.

    Returns:
        True if the username XPath matches an element, False if the lookup
        raises NoSuchElementException.
    """
    try:
        # The lookup either returns an element or raises, so no truthiness
        # test is needed. find_element(By.XPATH, ...) replaces the
        # find_element_by_xpath helper removed in Selenium 4.3.
        driver.find_element(By.XPATH, "//h4[@id][@class][./span[./a]]/span/a")
    except NoSuchElementException:
        return False
    return True
def group_element():
    """Look for a username linked to Facebook Groups at the top of the feed.

    Each known markup variant is tried in turn.

    Returns:
        The text of the first matching element, or the string
        "No user information found" when none of the XPaths match.
    """
    candidate_xpaths = (
        "//*[starts-with(@id, 'jsc_c_')]/span[1]/span/span/a/b",
        "//*[starts-with(@id, 'jsc_c_')]/strong[1]/span/a/span/span",
    )
    for xpath in candidate_xpaths:
        try:
            # Each variant gets its own try block: a miss on the first XPath
            # must fall through to the next one instead of jumping straight
            # to the fallback message.
            return driver.find_element(By.XPATH, xpath).text
        except NoSuchElementException:
            continue
    return "No user information found"
# Poll the feed forever: print the poster name, reload the page, then wait.
while True:
    try:
        # Read the standard username link in a single lookup. A separate
        # check followed by a second, unguarded lookup could raise if the
        # page changed in between and would terminate the loop.
        user_name = driver.find_element(
            By.XPATH, "//h4[@id][@class][./span[./a]]/span/a"
        ).text
    except NoSuchElementException:
        # Fall back to the markup used for posts linked to groups.
        user_name = group_element()

    print(user_name)
    driver.refresh()

    # set the sleep timer to fit your needs
    sleep(300)  # This sleeps for 300 seconds, which is 5 minutes.

    # A randomized interval is an alternative to the fixed one, e.g.
    # sleep(randint(180, 360)).
https://stackoverflow.com/questions/68741214/