Scraping Google search results with Python, combined with sqlmap for SQL injection testing
The main crawler script (the .py file referenced in the file list below):

from gevent import monkey; monkey.patch_all()
from bs4 import BeautifulSoup
import chardet
import os
import urllib.parse
import re
import random
import configparser
import requests
import gevent
from demo import fenpei


class Google_Disk(object):
    def __init__(self):
        self.conf = {}
        self.iplist = []
        self.calc = 0
        self.html = b''
        self.cookies = {}
        self.cookie = 'HSID=AnOingRydX5d2psm6; SSID=ADt9T-YUVJhcGL4qL; APISID=wJEaAiaIyzvEaudB/AcoN5lpzTLnX5Reo_; SAPISID=f7PURACCKCHWwSNN/AzvNr8jk9DaahBOjn; CONSENT=YES+CN.zh-CN+20170611-09-0; SID=BQd-7E64xr8N2KPkSozUAhhUGA1yC2pOm44rxZeltI5oyZczMhTQXcaLdnFMy6KuYM7CVQ.; _ga=GA1.1.1066659943.1561908462; _gcl_au=1.1.1103150496.1563265661; ANID=AHWqTUkF83QBPYbfQq0kmzf1KcFRM9zsr6E6DzhE_HothF5Y28xI_VdxHrB1fMar; SEARCH_SAMESITE=CgQIzY0B; GOOGLE_ABUSE_EXEMPTION=ID=becbf893a4904d44:TM=1566184449:C=r:IP=47.75.69.236-:S=APGng0se1h0QgE8PglXBZJi1H6W3jRYdzw; NID=188=I04uuKTsGOjSp5c3G9QzFnfHqsL7ZQE3t9FdHLq25aPPiAHLfdWBsh3j3v14esoRRMVNXV6Pg8WXsqliJ8c7G46efNs-16lEr8ZZn6Fvz0GzYcw6wzcJ78OWUOuiz0K8W63M0zuBNTUDDmzVBxiud788TjTvbI5CZurTIcD6z2TTwQ_TuoGvjP2cuutFWcs5C8_11nk35jERGC2_A2UPda-AtI2mnVspSF5NNpawFUwW8PgQpxM; DV=oylrE6tRiwhOECBuCtWvdH13M-J_yhYIrTZO_A7m2wIAAABsoyqeic4gCwEAAFj9N_RUZyHkUQAAAA; 1P_JAR=2019-8-19-3; SIDCC=AN0-TYtz7HmrYpB6Cyw9ogysPbuDr2AY0pBl89HytGxEBiBr2lsZ4ceFMNWkG4Efolz2ihLVoMth'
        # turn the raw cookie header into a dict
        for v in self.cookie.split(';'):
            key, value = v.split('=', 1)
            self.cookies[key] = value
        self.headers = {'user-agent': 'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/76.0.3809.100 Safari/537.36'}
        self.host = 'https://www.google.tw/search?'
        if os.path.exists('Config.ini'):
            print('[~] Reading config file')
            config = configparser.ConfigParser()
            config.read('Config.ini', encoding='utf-8')
            self.conf['proxy'] = config['config']['proxy']
            self.conf['save'] = config['config']['save_name']
            self.conf['search'] = config['config']['search_grammar']
            self.conf['page'] = config['config']['page']
            self.conf['sleep'] = config['config']['sleep']
            print('[+] Config loaded')
        else:
            print('[-] Config file not found')
            exit()
        if os.path.exists('iplist.txt'):
            print('[~] iplist.txt found, a random proxy IP will be picked for each request')
            dk = open('iplist.txt', 'r')
            for r in dk.readlines():
                data = "".join(r.split('\n'))
                self.iplist.append(data)
            proxy = self.iplist
        else:
            proxy = self.conf['proxy']
        print('[config] proxy: {}'.format(proxy))
        print('[config] search grammar: {}'.format(self.conf['search']))
        print('[config] pages to crawl: {}'.format(self.conf['page']))
        print('[config] output file: {}'.format(self.conf['save']))

    def search(self):
        # fetch each result page (Google paginates in steps of 10) and accumulate the HTML
        for p in range(0, int(self.conf['page'])):
            page = p * 10
            if len(self.iplist) > 0:
                proxy = random.choice(self.iplist)
            else:
                proxy = self.conf['proxy']
            try:
                html = fenpei(proxy=proxy, search=self.conf['search'],
                              page=page, sleep=self.conf['sleep'])
                if b'302 Moved' not in html:
                    self.html += html
                else:
                    print('[-] Google is asking for a captcha again...')
            except Exception as r:
                print('in search exception')
                print(r)

    def chuli(self):
        # extract result links from the accumulated HTML and append them to the save file
        try:
            link_list = re.findall(r"<div class=\"yuRUbf\"><a href=\"(.+?)\" ping=\"", str(self.html))
            for url in link_list:
                print(url.replace('&sa=U&', '').replace('&', ''))
                print(url.replace('&sa=U&', '').replace('&', ''),
                      file=open(self.conf['save'], 'a'))
        except:
            pass


if __name__ == '__main__':
    obj = Google_Disk()
    obj.search()
    obj.chuli()
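The chuli() method pulls links with a regex tied to Google's current result markup (the yuRUbf container div). bs4 is imported but never used; purely as a sketch of my own, not part of the original script, the same extraction could be done with BeautifulSoup, assuming that container class is still what Google serves:

from bs4 import BeautifulSoup

def extract_links(html_bytes):
    # hypothetical alternative to chuli(): parse the accumulated HTML and
    # collect the href of each result anchor inside a div.yuRUbf container
    soup = BeautifulSoup(html_bytes, 'html.parser')
    links = []
    for div in soup.find_all('div', class_='yuRUbf'):
        a = div.find('a')
        if a and a.get('href'):
            links.append(a['href'].replace('&sa=U&', '').replace('&', ''))
    return links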
demo.py: the commonly used helper functions are wrapped here. One key point: normally I only test sites inside China, so the URL needs the lr=lang_zh-CN&tbs=lr%3Alang_1zh-CN parameters; the full URL looks like this:
url='https://{}/search?lr=lang_zh-CN&tbs=lr%3Alang_1zh-CN&q={}&btnG=Search&gbv=10&start={}'.format(domains,search,page)
demo.py in full:

import random
import requests
import time
from bs4 import BeautifulSoup
from http.cookiejar import LWPCookieJar
from urllib.request import Request, urlopen
from urllib.parse import quote_plus, urlparse, parse_qs


def read():
    # yield one user-agent string per line of user_agents.txt
    dk = open('user_agents.txt', 'r', encoding='utf-8')
    for r in dk.readlines():
        data = "".join(r.split('\n'))
        yield data


def reads():
    # yield one Google front-end domain per line of domain.txt
    dk = open('domain.txt', 'r', encoding='utf-8')
    for r in dk.readlines():
        data = "".join(r.split('\n'))
        yield data


def fenpei(proxy, search, page, sleep):
    # pick a random user-agent and Google domain, wait, then fetch one result page through the proxy
    user_agents = []
    google_searchs = []
    for ua in read():
        user_agents.append(ua)
    for domain in reads():
        google_searchs.append(domain)
    time.sleep(int(sleep))
    proxy = {'http': 'http://{}'.format(proxy), 'https': 'https://{}'.format(proxy)}
    domains = random.choice(google_searchs)
    u_s = {'user-agent': random.choice(user_agents), 'Content-type': "text/html;charset=utf-8"}
    url = 'https://{}/search?lr=lang_zh-CN&tbs=lr%3Alang_1zh-CN&q={}&btnG=Search&gbv=10&start={}'.format(domains, search, page)
    requests.packages.urllib3.disable_warnings(requests.packages.urllib3.exceptions.InsecureRequestWarning)
    rqt = requests.get(url=url, headers=u_s, allow_redirects=False, verify=False, proxies=proxy, timeout=30)
    return rqt.content
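A quick standalone call for testing, purely illustrative and assuming the proxy and search grammar from the Config.ini shown below:

# illustrative only: fetch the first result page through the local proxy
page_html = fenpei(proxy='127.0.0.1:12639', search='inurl:php?id=', page=0, sleep=5)
print(len(page_html))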
All the files are listed below; the .py files are the two Python scripts above.
- Config.ini: the crawler's configuration file: the proxy server's IP and port, the file the results are saved to, the search keywords, the number of Google result pages to crawl, and the delay between requests (to keep Google from asking for a captcha).
[config]
proxy=127.0.0.1:12639
save_name=save.txt
search_grammar=inurl:php?id=
page=10
sleep=5
- domain.txt: Google's regional front-ends around the world; rotating through different ones helps avoid the captcha check (an illustrative sample follows this list).
- user_agents.txt: different operating systems and browsers send different user-agent strings, and switching between them also helps avoid the captcha check. In practice, though, different user-agents make Google return differently structured pages, which breaks the later URL extraction, so for now I pin it to this one: Mozilla/5.0 (Windows NT 6.2; WOW64) AppleWebKit/537.36 (KHTML like Gecko) Chrome/44.0.2403.155 Safari/537.36
- iplist.txt: the list of proxy servers, one entry per line in ip:port format.
- save.txt: the file the scraped results are written to; every URL contains the keyword we set, far more precise than Baidu.
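Purely as an illustration (not the author's actual list), domain.txt holds one regional Google front-end per line, for example:

www.google.com.hk
www.google.co.jp
www.google.de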
Judging from the sqlmap run, quite a few of the URLs are injectable: of the 50 URLs scraped, 11 tested positive for SQL injection, a rate above 20% (they are saved in the injection file in the same directory). No wonder SQL injection has ranked near the top of the OWASP list for so long (the other long-runner is XSS; more than half of the high-risk findings from my recent xray and awvs scans were XSS).
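For reference, a minimal sketch of how save.txt can be handed to sqlmap in one go: -m reads targets from a file and --batch answers sqlmap's prompts with its defaults. It assumes the sqlmap executable is on PATH; otherwise invoke python sqlmap.py instead.

import subprocess

# illustrative only: test every URL collected in save.txt, non-interactively
subprocess.run(['sqlmap', '-m', 'save.txt', '--batch'], check=False)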