- 论坛徽章:
- 0
|
本帖最后由 dahe_1984 于 2020-04-21 16:49 编辑
用 python 抓取受 js 保护的网站
抓取某网站,发现登陆页面被js保护,前端显示如下图。图一
抓包如下图
遇到这种情况,最好还是用抓包软件解析下,用浏览器的调试模式,可能会错过一些信息,比如图中503和200的link是同一个,后面的200会覆盖前面的503.
抓取这种网站的主要思想就是获取503,302的cookies,然后用这两个cookies获取登陆页面。
1. 首先分析503页面:请求参数为空,返回的页面即图一
查看form如下:
<form id="challenge-form" action="/cdn-cgi/l/chk_jschl" method="get">
<input type="hidden" name="s" value="214700f32d10b867fdd49da8d985ea16bd6b29b7-1567475977-1800-AZPGfDwpe6n5W+bfR/JfkH0b7mHwbAhipRp5XUfgJyerOrD2AubKF7Aa8ERAXe98NRTTPCED3szad+VCfQ1ou2E9I3ku3zmS02TufI0Dn7GhR5T3M5N//h2XkNsnO7YwPA=="></input>
<input type="hidden" name="jschl_vc" value="b5f15504d03c61d3885516a63b0ddfa3"/>
<input type="hidden" name="pass" value="1567475981.464-zEwHsqZ/nE"/>
<input type="hidden" id="jschl-answer" name="jschl_answer"/>
</form>
其中 s、jschl_vc、pass 为明文,通过模式匹配即可获得;主要的难点是 jschl_answer,这个值是由页面里的 javascript(完整页面见下文)计算所得:
a.value = (+lEsmRuZ.MAC + t.length).toFixed(10);
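lEsmRuZ.MAC 是个随机值:每次 request 返回的页面里都带着一长串形如“!![]+!![]+!![]+!![])+”的混淆表达式,脚本再对它做几步随机的加减乘运算,最后加上 t.length。需要注意,t 并不是由页脚的 https://www.********.com/5xx-error-landing?utm_source=iuam 这个link算出来的:从脚本可以看到,t 是把 <a href='/'> 解析成完整地址后,去掉协议头和末尾的“/”得到的字符串,也就是当前站点的域名。同一个站点的域名不变,所以 t.length 是固定值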
重新梳理下:jschl_answer = 随机值 + 固定值。
<!DOCTYPE HTML>
<html lang="en-US">
<head>
<meta charset="UTF-8" />
<meta http-equiv="Content-Type" content="text/html; charset=UTF-8" />
<meta http-equiv="X-UA-Compatible" content="IE=Edge,chrome=1" />
<meta name="robots" content="noindex, nofollow" />
<meta name="viewport" content="width=device-width, initial-scale=1, maximum-scale=1" />
<title>Just a moment...</title>
<script type="text/javascript">
//<![CDATA[
(function(){
var a = function() {try{return !!window.addEventListener} catch(e) {return !1} },
b = function(b, c) {a() ? document.addEventListener("DOMContentLoaded", b, c) : document.attachEvent("onreadystatechange", b)};
b(function(){
var a = document.getElementById('cf-content');a.style.display = 'block';
setTimeout(function(){
var s,t,o,p,b,r,e,a,k,i,n,g,f, lEsmRuZ={"MAC":+((!+[]+!![]+!![]+!![]+!![]+!![]+!![]+!![]+[])+(!+[]+!![]+!![]+!![]+!![])+(!+[]+!![]+!![]+!![]+!![]+!![]+!![]+!![])+(!+[]+!![]+!![]+!![]+!![])+(!+[]+!![]+!![]+!![]+!![]+!![]+!![]+!![]+!![])+(!+[]+!![]+!![]+!![]+!![]+!![]+!![]+!![]+!![])+(!+[]+!![]+!![]+!![]+!![])+(+[])+(!+[]+!![]+!![]))/+((!+[]+!![]+!![]+!![]+!![]+!![]+!![]+!![]+!![]+[])+(!+[]+!![]+!![]+!![]+!![]+!![]+!![])+(!+[]+!![])+(!+[]+!![]+!![]+!![]+!![]+!![]+!![]+!![])+(!+[]+!![]+!![]+!![]+!![]+!![])+(+[])+(!+[]+!![])+(!+[]+!![]+!![]+!![]+!![]+!![]+!![]+!![])+(!+[]+!![]+!![]+!![]+!![]+!![]+!![]+!![]))};
g = String.fromCharCode;
o = "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789+/=";
// Base64 decoder: turns the base64 text `s` into a binary string with one
// character per decoded byte. Depends on two names assigned just before it:
// `o`, the 65-character alphabet whose index 64 is the "=" pad character, and
// `g`, an alias for String.fromCharCode.
e = function(s) {
    // Re-append "=" padding according to the input length modulo 4, so the
    // loop below can always consume the input four characters at a time.
    s += "==".slice(2 - (s.length & 3));
    var bm, r = "", r1, r2, i = 0;
    // No increment clause: the four charAt(i++) reads in the body advance `i`.
    for (; i < s.length;) {
        // Pack four 6-bit alphabet indexes into one 24-bit group.
        bm = o.indexOf(s.charAt(i++)) << 18 | o.indexOf(s.charAt(i++)) << 12
            | (r1 = o.indexOf(s.charAt(i++))) << 6 | (r2 = o.indexOf(s.charAt(i++)));
        // Index 64 is "=": padding in the 3rd position means the group holds
        // one byte, padding in the 4th position means two bytes, else three.
        r += r1 === 64 ? g(bm >> 16 & 255)
            : r2 === 64 ? g(bm >> 16 & 255, bm >> 8 & 255)
            : g(bm >> 16 & 255, bm >> 8 & 255, bm & 255);
    }
    return r;
};
// Build a detached <div> holding a single anchor that points at "/".
t = document.createElement('div');
t.innerHTML="<a href='/'>x</a>";
// Read the anchor's href back and capture its leading "http://" or "https://"
// prefix in `r`. NOTE(review): this relies on the browser returning the href
// as an absolute URL; match() would return null otherwise.
t = t.firstChild.href;r = t.match(/https?:\/\//)[0];
// Drop the scheme prefix, then drop the final character (presumably the
// trailing "/"), which should leave the host part of the current page's URL.
// Only t.length is used afterwards, in the jschl_answer computation.
t = t.substr(r.length); t = t.substr(0,t.length-1);
// The jschl_answer <input> that receives the computed answer.
a = document.getElementById('jschl-answer');
// The challenge <form> that is submitted once the answer has been filled in.
f = document.getElementById('challenge-form');
;lEsmRuZ.MAC*=+((!+[]+!![]+!![]+!![]+!![]+!![]+!![]+!![]+!![]+[])+(!+[]+!![])+(+[])+(!+[]+!![]+!![]+!![])+(+!![])+(!+[]+!![]+!![]+!![]+!![]+!![]+!![]+!![]+!![])+(!+[]+!![]+!![]+!![]+!![]+!![]+!![]+!![])+(!+[]+!![])+(!+[]+!![]+!![]))/+((!+[]+!![]+!![]+!![]+!![]+[])+(!+[]+!![]+!![])+(!+[]+!![]+!![]+!![]+!![]+!![]+!![]+!![]+!![])+(!+[]+!![]+!![]+!![]+!![]+!![]+!![])+(!+[]+!![]+!![]+!![]+!![]+!![]+!![])+(!+[]+!![]+!![]+!![]+!![]+!![])+(!+[]+!![]+!![]+!![]+!![])+(!+[]+!![]+!![])+(+[]));lEsmRuZ.MAC-=+((!+[]+!![]+!![]+!![]+!![]+!![]+!![]+!![]+[])+(!+[]+!![]+!![]+!![]+!![])+(!+[]+!![]+!![]+!![]+!![]+!![]+!![]+!![])+(!+[]+!![]+!![]+!![]+!![])+(!+[]+!![]+!![]+!![]+!![]+!![]+!![]+!![]+!![])+(!+[]+!![]+!![]+!![]+!![]+!![]+!![]+!![]+!![])+(!+[]+!![]+!![]+!![]+!![])+(+[])+(!+[]+!![]+!![]))/+((!+[]+!![]+!![]+!![]+!![]+[])+(!+[]+!![]+!![]+!![]+!![]+!![]+!![]+!![]+!![])+(!+[]+!![]+!![]+!![]+!![]+!![])+(!+[]+!![]+!![]+!![])+(+[])+(!+[]+!![]+!![])+(!+[]+!![]+!![])+(!+[]+!![]+!![]+!![]+!![]+!![]+!![])+(!+[]+!![]));lEsmRuZ.MAC-=+((!+[]+!![]+!![]+!![]+!![]+!![]+!![]+!![]+[])+(!+[]+!![]+!![])+(!+[]+!![]+!![]+!![]+!![]+!![]+!![]+!![])+(+[])+(!+[]+!![]+!![]+!![])+(+!![])+(!+[]+!![]+!![]+!![]+!![]+!![])+(!+[]+!![]+!![]+!![])+(+!![]))/+((!+[]+!![]+!![]+!![]+[])+(!+[]+!![]+!![]+!![]+!![]+!![])+(!+[]+!![]+!![]+!![]+!![]+!![]+!![]+!![])+(!+[]+!![])+(!+[]+!![])+(!+[]+!![]+!![]+!![]+!![]+!![])+(!+[]+!![]+!![])+(!+[]+!![]+!![]+!![]+!![]+!![]+!![]+!![]+!![])+(!+[]+!![]+!![]+!![]+!![]+!![]+!![]));lEsmRuZ.MAC-=+((!+[]+!![]+!![]+!![]+!![]+!![]+!![]+[])+(!+[]+!![]+!![]+!![]+!![]+!![]+!![]+!![]+!![])+(!+[]+!![]+!![]+!![]+!![]+!![]+!![])+(+[])+(+[])+(!+[]+!![]+!![])+(!+[]+!![]+!![]+!![])+(!+[]+!![]+!![])+(!+[]+!![]+!![]+!![]+!![]+!![]+!![]))/+((!+[]+!![]+!![]+!![]+!![]+!![]+!![]+!![]+!![]+[])+(!+[]+!![])+(+[])+(!+[]+!![]+!![]+!![]+!![]+!![]+!![]+!![])+(!+[]+!![])+(!+[]+!![]+!![]+!![]+!![]+!![]+!![]+!![]+!![])+(!+[]+!![]+!![]+!![]+!![]+!![])+(!+[]+!![]+!![]+!![]+!![])+(!+[]+!![]+!![]+!![]+!![]+!![]+!![]));
lEsmRuZ.MAC+=+((!+[]+!![]+!![]+!![]+!![]+!![]+!![]+!![]+!![]+[])+(!+[]+!![])+(+[])+(!+[]+!![]+!![]+!![])+(+!![])+(!+[]+!![]+!![]+!![]+!![]+!![]+!![]+!![]+!![])+(!+[]+!![]+!![]+!![]+!![]+!![]+!![]+!![])+(!+[]+!![])+(!+[]+!![]+!![]))/+((+!![]+[])+(+!![])+(!+[]+!![]+!![]+!![])+(!+[]+!![]+!![]+!![]+!![]+!![]+!![]+!![]+!![])+(!+[]+!![]+!![]+!![]+!![]+!![]+!![])+(!+[]+!![]+!![]+!![]+!![]+!![]+!![]+!![])+(+[])+(!+[]+!![]+!![])+(!+[]+!![]+!![]+!![]+!![]+!![]+!![]+!![]+!![]));lEsmRuZ.MAC-=+((!+[]+!![]+!![]+!![]+!![]+!![]+!![]+!![]+[])+(!+[]+!![]+!![])+(!+[]+!![]+!![]+!![]+!![]+!![]+!![]+!![])+(+[])+(!+[]+!![]+!![]+!![])+(+!![])+(!+[]+!![]+!![]+!![]+!![]+!![])+(!+[]+!![]+!![]+!![])+(+!![]))/+((!+[]+!![]+[])+(!+[]+!![])+(!+[]+!![]+!![]+!![]+!![]+!![]+!![]+!![])+(!+[]+!![]+!![]+!![]+!![]+!![]+!![]+!![])+(+[])+(!+[]+!![]+!![])+(+[])+(!+[]+!![]+!![]+!![]+!![])+(!+[]+!![]+!![]+!![]+!![]));lEsmRuZ.MAC+=+((!+[]+!![]+!![]+!![]+!![]+!![]+!![]+[])+(!+[]+!![]+!![]+!![]+!![]+!![]+!![]+!![]+!![])+(!+[]+!![]+!![]+!![]+!![]+!![]+!![])+(+[])+(+[])+(!+[]+!![]+!![])+(!+[]+!![]+!![]+!![])+(!+[]+!![]+!![])+(!+[]+!![]+!![]+!![]+!![]+!![]+!![]))/+((!+[]+!![]+!![]+!![]+!![]+!![]+[])+(!+[]+!![]+!![]+!![]+!![]+!![]+!![]+!![])+(!+[]+!![]+!![]+!![]+!![]+!![]+!![]+!![]+!![])+(!+[]+!![]+!![]+!![]+!![]+!![]+!![])+(!+[]+!![]+!![]+!![])+(!+[]+!![]+!![]+!![]+!![]+!![]+!![]+!![])+(!+[]+!![])+(!+[]+!![])+(!+[]+!![]+!![]));a.value = (+lEsmRuZ.MAC + t.length).toFixed(10); '; 121'
f.action += location.hash;
f.submit();
}, 4000);
}, false);
})();
//]]>
</script>
</head>
<body>
<table width="100%" height="100%" cellpadding="20">
<tr>
<td align="center" valign="middle">
<div class="cf-browser-verification cf-im-under-attack">
<noscript>
<h1 data-translate="turn_on_js" style="color:#bd2426;">Please turn JavaScript on and reload the
page.</h1>
</noscript>
<div id="cf-content" style="display:none">
<div>
<div class="bubbles"></div>
<div class="bubbles"></div>
<div class="bubbles"></div>
</div>
<h1><span data-translate="checking_browser">Checking your browser before accessing</span>
*********.com.</h1>
<p data-translate="process_is_automatic">This process is automatic. Your browser will redirect
to your requested content shortly.</p>
<p data-translate="allow_5_secs">Please allow up to 5 seconds…</p>
</div>
<form id="challenge-form" action="/cdn-cgi/l/chk_jschl" method="get">
<input type="hidden" name="s" value="554f9b8fba5518276e47f02c4e1276f3c3340ea3-1567480928-1800-Aft/VQ1MXXkT2xWDd7GHcFY32dF+cxnhubNnrmES8CcnPpf0mhfRJklvnS0PU6UplsC/zyBUC2eY0F9nBTSMqNinVFkJZognbEcr8SYVvW8TUDvm4xjnZA7POjQAkkTUTQ=="></input>
<input type="hidden" name="jschl_vc" value="fc7e0fa2603515f1c4b871d263e50984"/>
<input type="hidden" name="pass" value="1567480932.916-NRwiPjePzW"/>
<input type="hidden" id="jschl-answer" name="jschl_answer"/>
</form>
</div>
<div class="attribution">
<a href="https://www.**********.com/5xx-error-landing?utm_source=iuam" target="_blank"
style="font-size: 12px;">DDoS protection by ******</a>
<br>
Ray ID: 5104201b18a7952d
</div>
</td>
</tr>
</table>
</body>
</html>
a. 搞定固定值
如果javascript很好,当然小菜一碟;我这种js很渣的,算不清楚。那就先把 t.length 设为 0,用 execjs 算出随机值部分,再和抓包得到的 jschl_answer 比较,反推出 t.length 的值
import execjs
def get_js():
    """Return the whole content of ./test_answer.js as one string.

    The file is decoded as UTF-8. It is opened in a ``with`` block so the
    handle is closed before returning, including when reading raises.

    Returns:
        str: the file content, or an empty string for an empty file.

    Raises:
        OSError: if ./test_answer.js cannot be opened or read.
    """
    with open("./test_answer.js", 'r', encoding='UTF-8') as f:
        return f.read()
# Request the challenge page without following redirects, so the cookies set
# by this first response can be inspected.
# NOTE(review): `s`, `login_503` and `headers` are defined outside this
# snippet; `s` is presumably a requests.Session -- confirm.
login_js = s.get(login_503, headers=headers, allow_redirects=False)
print("cookies:", login_js.cookies.get_dict())
# Written as UTF-8 to match the encoding get_js() reads the file back with.
with open('./test_answer.js', 'w', encoding='UTF-8') as f:
    # The copied statements end with a `return`, and the code below calls
    # ctx.call('enString'), so they must sit inside a function of that name.
    f.write("function enString() {\n")
    # Split the page on newlines and keep only the lines carrying the
    # obfuscated "!+[]" arithmetic. `a.value` becomes a plain variable and
    # t.length is set to 0, so the result is only the random part of the answer.
    for tmp in login_js.text.split('\n'):
        if ("!" in tmp and "+" in tmp and "[" in tmp and "]" in tmp):
            f.write(tmp.replace('a.value', 'a_value').replace("t.length", "0"))
            f.write('\n')
    f.write(" return a_value\n")
    f.write("}\n")
jsstr = get_js()
ctx = execjs.compile(jsstr)
# Evaluate once and reuse the value instead of running the script twice.
jschl_answer = ctx.call('enString')
print("jschl_answer:", jschl_answer)
jschl_answer:20.0220718604
而通过抓包得到的是jschl_answer: 34.0220718604, 明显t.length为14
s:
554f9b8fba5518276e47f02c4e1276f3c3340ea3-1567480928-1800-Aft/VQ1MXXkT2xWDd7GHcFY32dF+cxnhubNnrmES8CcnPpf0mhfRJklvnS0PU6UplsC/zyBUC2eY0F9nBTSMqNinVFkJZognbEcr8SYVvW8TUDvm4xjnZA7POjQAkkTUTQ==
jschl_vc: fc7e0fa2603515f1c4b871d263e50984
pass: 1567480932.916-NRwiPjePzW
jschl_answer: 34.0220718604
b. 第二个大坑
还是上面的JavaScript,setTimeout 后面有个4000.
计算 jschl_answer 之后,在发送https://www.********.com/cdn-cgi/l/chk_jschl?s=554f9b8fba5518276e47f02c4e1276f3c3340ea3-1567480928-1800-Aft/VQ1MXXkT2xWDd7GHcFY32dF+cxnhubNnrmES8CcnPpf0mhfRJklvnS0PU6UplsC/zyBUC2eY0F9nBTSMqNinVFkJZognbEcr8SYVvW8TUDvm4xjnZA7POjQAkkTUTQ==&jschl_vc=fc7e0fa2603515f1c4b871d263e50984&pass=1567480932.916-NRwiPjePzW&jschl_answer=34.0220718604 之前必须sleep 4秒
c. 登陆表单还有 csrf(跨站请求伪造)token 校验
登陆页面分成了两份,一个是页面页脚,另一个js文件(显示登陆用户名,密码)
csrf的token隐藏在页面页脚中,如图
最后总结下, 不断获得cookies
cookies: {'__cfduid': 'd40554f153bac94777d004e7f4f5a0bb91567482143'}
jschl_answer: 30.0195545700
first cookies: {'__cfduid': 'd40554f153bac94777d004e7f4f5a0bb91567482143'} https://www.**********.com/cdn-cgi/l/chk_jschl?s=1a089f63470949db9d75581c7a8315f88daf6c4f-1567482143-1800-AaJtjv%2BX%2F4pBwXRAnSjILK42Tt2ZnTT%2FmLBnY%2FGmaficjW6BfkFHxQBsgxHxbNYThpw3nDTtdWW2fo2U1zVrTLWa%2B3Uv0%2BCqpxASgPkHvTPj8PBtaW8VRVxXCy190KviGw%3D%3D&jschl_vc=60e7a8deca9ccda53a25735326b48f02&pass=1567482147.894-DTZ2MlONTf&jschl_answer=30.0195545700
second cookies: {'__cfduid': 'd40554f153bac94777d004e7f4f5a0bb91567482143', 'cf_clearance': '473ccd578f991fb92abadc5fe0ee301cb2e0f3d8-1567482148-1800-150'} 302
second cookies: {'__cfduid': 'd40554f153bac94777d004e7f4f5a0bb91567482143', 'cf_clearance': '473ccd578f991fb92abadc5fe0ee301cb2e0f3d8-1567482148-1800-150', 'PHPSESSID': '40mr2nm41a4i09sqqbd4e0tdr4', 'alive': '1'} 200
_csrf_token TismOkW2dfaen9GL-8aBl4cWhelNiY3602-ZuCkR2HM
The status of Login 302 {'__cfduid': 'd40554f153bac94777d004e7f4f5a0bb91567482143', 'cf_clearance': '473ccd578f991fb92abadc5fe0ee301cb2e0f3d8-1567482148-1800-150', 'PHPSESSID': 'nehh90nbqo9r7n55mc5ars1cr2', 'alive': '1'}
200 <RequestsCookieJar[<Cookie alive=1 for www.*********.com/>]> https://www.*********.com/en/market/primary/list.xlsx {'lender_groups[]': 43, 'sort_field': 'interest', 'sort_order': 'DESC', 'max_results': 300}
|
|