Thread Web Scraping
(12 answers)
Opened by kimmy at 2018-10-30 12:38
Hallo.
Ich versuche, ein Script zu schreiben, das den vollständigen Seitenquelltext einer Webseite herunterlädt. Code (perl): (dl)
#!/usr/bin/perl -w
use strict;
use WWW::Mechanize;

my $mech = WWW::Mechanize->new();
my $response = $mech->get('https://www.digikey.com/products/en?vendor=0&keywords=BAS16W');

if ($response->is_success) {
    print $mech->content;
}
else {
    die $response->status_line;
}

Aber dann bekomme ich nur folgenden Quelltext:

<!DOCTYPE html><html><head><script>var i10cdone = (function() { function pingBeacon(msg) { var i10cimg = document.createElement('script'); i10cimg.src = '/i10c@p1/botox/file/nv-loaded.js?status=' + window.encodeURIComponent(msg); i10cimg.onload = function() { document.head.removeChild(i10cimg) }; i10cimg.onerror = function() { document.head.removeChild(i10cimg) }; document.head.appendChild(i10cimg) }; pingBeacon('loaded'); if (String(document.cookie).indexOf('i10c.bdddb=c2-f0103ZLNqAeI3BH6yYOfG7TZlRtCrMwqUo') >= 0) { document.cookie = 'i10c.bdddb=;path=/'; }; var error = ''; var i10cevthandler = window.addEventListener('error', function(e) { if (e && e.error && e.error.stack) { error = e.error.stack; } else if (e && e.message) { error = e.message; } else { error = 'unknown'; } }); return function() { window.removeEventListener('error', i10cevthandler); if (error) { pingBeacon('error-' + String(error).substring(0, 500)); document.cookie = 'i10c.bdddb=c2-f0103ZLNqAeI3BH6yYOfG7TZlRtCrMwqUo;path=/'; } }; })(); </script><script type=text/javascript data-config="%7B%7D" src="/i10c@p1/client/latest/auto/instart.js?i10c.opts=botox"></script><script>typeof i10cdone === 'function' && i10cdone();</script></head><body><script>setTimeout(function(){document.cookie="i10c.eac23=1";window.location.reload(true);},30);</script></body></html>

Ich nehme an, die Seite lässt sich nicht crawlen, oder? Gibt es irgendeine Möglichkeit, die Seite trotzdem herunterzuladen? |