#!/usr/bin/perl
use strict;
#
my $VERSION = "0.3 beta";
# 2004-08-25 - Fixing some Getopts things. Trying to get multiple-dld to work
# 2004-08-24 - Push URL, when they seem to be clips (Clips can also have large/medium/small links)
# Added Getopts - get help with "--help"
# 2004-08-23 - Started putting appletrailer and appletrailers together
# 2004-08-18 - Fix for Alfie
# http://www.apple.com/trailers/paramount/alfie/
# TWO Embed Tags (Full Screen Button and Trailer)
# _fs.mov (not fs1.mov)
# 2004-08-12 - Started Modification to get all Trailers from
# http://www.apple.com/trailers
# -- Version 0.1 --
# 2004-03-16 - First Version
# Takes an URL as parameter ( like the .../large.html
# or .../moviename/) and finds the trailer to download
# This Script gets the latest Apple Trailers
# Created by Leonard Tulipan (micattack@gmx.net)
# Homepage: http://downlets.sourceforge.net/
# Released under the GPL - This is free software
# Needs #
# HTML::TreeBuilder
# Bundle::LWP
# Term::ProgressBar
# File::Basename
# To install you probably just need to execute:
# perl -e "use CPAN; install HTML::TreeBuilder; install Bundle::LWP;"
# perl -e "use CPAN; install Term::ProgressBar; install File::Basename;"
# Answer NO if cpan asks if you want to configure manually. This doesn't
# select your local mirror, but it also doesn't bother you with a lot of
# questions
# Todo #
# Use getopts and e.g.:
# -a ... All Trailers (Now Default)
# -m ... Movie Name (partial regex matching)
# -l ... List (a)ll (default), (e)xclusive, (S)tudio
# -k ... Ask before downloading
# -p ... Preferred Version (small - medium - large) - currently always the largest
# -s ... Download from that Studio
# Set to 1 to see debug output
# Can now be set with --debug command line switch
# Debug output is off by default; the --debug command line switch turns it on.
my $debug = 0;
# Unbuffer STDOUT so that progress output appears immediately.
$| = 1;
# HTML::TreeBuilder parses the downloaded HTML pages.
use HTML::TreeBuilder;
# LWP downloads the HTML pages and the movies. It is loaded through a string
# eval so that a missing module gives a short message instead of a compile error.
# perl -e "use CPAN; install Bundle::LWP;"
eval "use LWP 5.6.9;";
if ($@) { die "[err] LWP 5.6.9 or greater required.\n"; }
# Term::ProgressBar draws the download progress bar; loaded the same way.
eval "use Term::ProgressBar;";
if ($@) { die "[err] Term::ProgressBar not installed.\n"; }
# Getopt::Long handles the command line options.
use Getopt::Long;
# File::Basename provides dirname() and basename().
use File::Basename;
# Command line switches:
#   $version / $help ... print the banner (and the help text) and exit
#   @url             ... trailer pages to download from (empty: crawl all trailers)
#   $ask             ... ask for confirmation before each download
#   $nobig           ... do not probe for bigger (480 / full screen) movie files
my $version; my $help; my @url; my $ask; my $nobig;
# GetOptions returns false on unknown or malformed options; stop instead of
# silently starting a full crawl with half-understood arguments.
GetOptions( 'version' => \$version, 'debug' => \$debug,
            'help' => \$help, 'url=s' => \@url,
            'ask|k' => \$ask, 'nobig' => \$nobig)
    or die "[err] Invalid command line options, try --help\n";
# --url may be repeated and each value may itself be a comma separated list.
@url = split(/,/,join(',',@url));
if($version || $help)
{
    print " Appletrailers Version $VERSION\n";
    print " Created by Leonard Tulipan (micattack\@gmx.net)\n";
    print " Homepage: http://downlets.sourceforge.net/\n";
    print " Released under the GPL - This is free software\n";
    if($help)
    {
        # The heredoc terminator must stay in column 0.
        print <<EndHelp;
Help:
--version ... Print Version and exit (see it above, btw)
--help/-h ... This Help
--url/-u <URL> ... Download Trailer found on this URL (Specify the HTML not the MOV File)
e.g: -u http://www.apple.com/trailers/paramount/team_america/
You can specify more than one url with a comma separator
or simply specify multiple -u on the command line
--ask/-k ... Ask before Downloading a Trailer
--nobig ... Do not look for bigger (480 / full screen) versions of a Trailer
--debug/-d ... Turn Debugging On
EndHelp
    }
    exit;
}
print " Debugging is now on\n" if $debug;
# User agent used for fetching the HTML pages.
my $ua = LWP::UserAgent->new;
# Identify as iTunes. agent() takes only the header *value*; LWP adds the
# "User-Agent:" header name itself, so it must not be part of the string
# (otherwise the server receives "User-Agent: User-Agent: iTunes/...").
$ua->agent('iTunes/4.6 (Macintosh; U; PPC Mac OS X 10.3)');
# The trailer pages (../large.html or the like) that will be searched for a movie.
my @trailerpages;
# Fill @trailerpages: either by crawling http://www.apple.com/trailers/ (no
# --url given) or directly from the URLs passed on the command line.
if(!@url)
{
    # Fetch the trailer index page.
    # (The iTunes XML index would be www.apple.com/moviesxml/h/index.xml)
    my $request = HTTP::Request->new('GET', "http://www.apple.com/trailers/");
    my $response = $ua->request($request);
    die "[err] Could not fetch http://www.apple.com/trailers/: " . $response->status_line . "\n"
        unless $response->is_success;

    # parse() must be followed by eof(), otherwise the tree is not finalised.
    my $tree = HTML::TreeBuilder->new();
    $tree->parse($response->content);
    $tree->eof();

    # Every <div class=rowtext> is one section of the index page.
    my @trailer_rows = $tree->look_down("_tag", "div", sub{ $_[0]->attr('class') eq "rowtext"});
    die "[err] No <div class=rowtext> on the trailer index page - has the layout changed?\n"
        unless @trailer_rows;

    # First section: exclusive trailers, one <div class=link> each.
    my @exclusive = $trailer_rows[0]->look_down("_tag", "div", sub{ $_[0]->attr('class') eq "link"});
    # Second section: studio trailers, one <li> each.
    my @studio = $trailer_rows[1] ? $trailer_rows[1]->look_down("_tag", "li") : ();

    # Take the first <a href> out of every entry. All entries are visited,
    # including the last one, and entries without a usable link are skipped.
    my @trailers;
    for my $entry (@exclusive, @studio) {
        my $link = $entry->look_down("_tag", "a") or next;
        my $href = $link->attr('href');
        push @trailers, $href if defined $href && length $href;
    }
    $tree->delete();

    # A trailer can be listed as exclusive AND as studio trailer: keep each once.
    my %uniq;
    @uniq{@trailers} = ();
    my @queue = sort keys %uniq;
    print "Checking ". scalar(@queue) ." Trailers\n" if $debug;

    # Remember a page that may hold a trailer.
    my $push_page = sub {
        my ($page) = @_;
        print "Pushing URL: $page\n" if $debug;
        push @trailerpages, $page;
    };

    # Work queue instead of foreach: pages without a trailer add their own
    # links to the queue, and %seen guarantees every page is looked at only
    # once so that pages linking to each other (or to themselves) cannot
    # keep the crawl running forever.
    my %seen;
    PAGE: while (@queue) {
        my $current = shift @queue;
        # Make absolute paths, e.g. "sony/spiderman2" -> "/trailers/sony/spiderman2",
        # but leave paths which already start with "/" intact.
        $current = "/trailers/" . $current unless $current =~ /^\//;
        next PAGE if $seen{$current}++;

        # Directory of the page: strip a trailing slash, otherwise use dirname()
        # (dirname() gives the parent when the path ends in "/").
        my $currentdir = ($current =~ /\/$/)
            ? substr($current, 0, length($current)-1)
            : dirname($current);
        print "Looking at: ". $currentdir ."\n" if $debug;

        my $tr_resp = $ua->request(HTTP::Request->new('GET', "http://www.apple.com". $current));
        my $tr_tree = HTML::TreeBuilder->new();
        $tr_tree->parse($tr_resp->content);
        $tr_tree->eof();

        # Apple answers unknown pages with a "Page Not Found" document; skip it.
        if($tr_tree->look_down("_tag", "title", sub{ $_[0]->as_text eq "Apple - Page Not Found"})) {
            print "404 Page\n" if $debug;
            $tr_tree->delete();
            next PAGE;
        }

        # hrefs of all links on the page (anchors without href are ignored).
        my @hrefs = grep { defined } map { $_->attr('href') } $tr_tree->look_down("_tag", "a");

        # $found: 0 = nothing yet, 1 = trailer page found, 2 = looks like an index page
        my $found = 0;
        for my $href (@hrefs) {
            # lg ... e.g: Alien Vs Predator, large ... Incredibles,
            # 480 ... Maria Full Of Grace. Offsite links (http...) are ignored,
            # e.g. www.parentalguide.org would otherwise match "lg".
            if($href =~ /(lg|large|high|bigger|480)/i && !($href =~ /^http/)) {
                $found = 1;
                $push_page->($currentdir . "/" . $href);
            }
            if($href =~ /index/i && $found == 0) {
                # Not a trailer link, but the area test below may still find one.
                $found = 2;
            }
            # NOTE(review): the original also had a branch
            #   if(! $href =~ /^http/i && ! ($href =~ /^index/i))
            # which parses as (!$href) =~ /^http/i and therefore never matched.
            # Enabling it would push every relative link, so it is left out.
        }

        # An embed tag on the page: this is probably already the trailer page.
        if($tr_tree->look_down("_tag", "embed") && $found ne 1) {
            $push_page->($currentdir . "/");
            $found = 1;
        }

        # Still nothing: check the area links of image maps.
        if($found ne 1) {
            for my $area ($tr_tree->look_down("_tag", "area")) {
                my $href = $area->attr('href');
                next unless defined $href;
                if($href =~ /(lg|large|high|bigger|480)/i && !($href =~ /^http/)) {
                    $found = 1;
                    $push_page->($currentdir . "/" . $href);
                }
            }
        }

        # Nothing at all: local links could still lead to clips pages, so
        # queue them to be examined by this same loop.
        if($found eq 0) {
            for my $href (@hrefs) {
                next if $href =~ /^http/;
                my $candidate = $current . $href;
                next if $seen{$candidate};
                print "Adding to Trailers: " . $candidate . "\n" if $debug;
                push @queue, $candidate;
            }
        }
        $tr_tree->delete();
    }
} # end if(!url)
else {
    # URLs were given on the command line, check just those
    print "Pushing ". ($#url+1) . " URLs\n" if $debug;
    push @trailerpages, @url;
}
# Report the number of collected pages before and after removing duplicates.
print "Getting Trailers from ". scalar(@trailerpages) ." Trailerpages\n" if $debug;
# Use the pages as hash keys, which keeps every page exactly once,
# then put them back in sorted order.
my @uniq;
my %uniq = map { ($_ => undef) } @trailerpages;
@trailerpages = sort(keys(%uniq));
print "Getting Trailers from ". scalar(@trailerpages) ." Trailerpages\n" if $debug;
# Movie files are requested with a plain LWP agent of their own. The page
# agent $ua (iTunes identification) is deliberately left untouched so that
# every trailer page is fetched the same way.
my $dl_ua = LWP::UserAgent->new( );

# Content-Length reported by a HEAD request for $uri (undef if there is none).
my $remote_size = sub {
    my ($uri) = @_;
    return $dl_ua->head($uri)->headers->content_length;
};

# For every trailer page: find the movie behind the QuickTime <embed> tag,
# optionally look for a bigger version, and download it into the current
# directory unless a file of that name already exists.
PAGE: foreach my $page (@trailerpages)
{
    print "Getting: $page\n" if $debug;
    my $page_url = ($page =~ /^http/) ? $page : "http://www.apple.com" . $page;

    # Get the trailer HTML page (large.html or the like).
    my $response = $ua->request(HTTP::Request->new('GET', $page_url));
    # parse() must be followed by eof(), otherwise the tree is not finalised.
    my $tree = HTML::TreeBuilder->new();
    $tree->parse($response->content);
    $tree->eof();

    # The last QuickTime embed tag is the trailer (a page can have two embed
    # tags: full screen button and trailer).
    my @mov = $tree->look_down("_tag", "embed", sub{ $_[0]->attr('pluginspage') eq "http://www.apple.com/quicktime/download/indext.html"});
    my ($movuri, $href_uri);
    if(@mov) {
        $movuri   = $mov[$#mov]->attr('src');
        $href_uri = $mov[$#mov]->attr('href');
    }
    # Only the two strings are needed from here on, free the tree on every path.
    $tree->delete();
    next PAGE unless defined $movuri && length $movuri;

    # src is only a poster movie/image: the real movie is in the href attribute.
    if(basename($movuri) =~ /^qt[\/\-\_\d\w]*\.mov$/ || $movuri =~ /\/image/)
    { $movuri = $href_uri; }
    next PAGE unless defined $movuri && length $movuri;

    # A big file is the movie itself; a small one is a reference movie that
    # contains the name of the real movie file.
    my $fullmovuri;
    my $total_size = $remote_size->($movuri);
    if(defined $total_size && $total_size > 1000000)
    { $fullmovuri = $movuri; }
    else
    {
        my $ref_resp = $dl_ua->request(HTTP::Request->new('GET', $movuri));
        if($ref_resp->content =~ /\x20\x00\x00\x00.?([\/\-\_\d\w]*\.mov)\x00/)
        {
            my $target = $1;
            # Directory of the reference movie, plus a slash if the name has none.
            my $base = dirname($movuri);
            $base .= "/" if (!($target =~ /^\//));
            $fullmovuri = $base . $target;
        }
    }
    next PAGE unless defined $fullmovuri && $fullmovuri ne "";

    if(! $nobig)
    {
        # Try bigger versions in order; a candidate replaces the current
        # movie only if the server reports more than 1000000 bytes for it.
        my @upgrades = (
            [ qr/(.*)240\.mov$/,  "480.mov" ],
            [ qr/(.*)m480\.mov$/, "ifs.mov" ],   # iTunes full screen
            [ qr/(.*)m480\.mov$/, "fs1.mov" ],   # standard full screen
            [ qr/(.*)m480\.mov$/, "fs.mov"  ],   # standard full screen
        );
        for my $upgrade (@upgrades) {
            my ($pattern, $suffix) = @$upgrade;
            next unless $fullmovuri =~ $pattern;
            my $bigtest = $1 . $suffix;
            print "Testing: $bigtest\n" if $debug;
            my $big_size = $remote_size->($bigtest);
            $fullmovuri = $bigtest if defined $big_size && $big_size > 1000000;
        }
    } # end ! $nobig

    my $url = $fullmovuri;
    my $outfile = basename($url);
    if( -e $outfile) {
        print "Trailer ". substr($outfile, 0, 40) ." already exists\n" if $debug;
        next PAGE;
    }

    print "Downloading $url ", substr($outfile, 0, 40);
    my $answer = "";
    if($ask) {
        print " $url OK? (Y/n): ";
        $answer = <STDIN>;
        $answer = "" unless defined $answer;   # EOF on STDIN counts as "yes"
        chomp($answer);
    }
    print "\nAnswer: \"$answer\"\n" if $debug;

    # The file is created even when the user declines: the empty file makes
    # the next run skip this trailer. Movies are binary data, hence binmode.
    open(my $mov_fh, '>', $outfile) or die "Cannot write $outfile: $!\n";
    binmode($mov_fh);

    if($answer =~ /^n/i) {
        print "Empty File created\n";
        close($mov_fh);
        next PAGE;
    }

    $total_size = $remote_size->($url);
    printf( " ( %.2fMB )... \n", $total_size/1024/1024 );

    # initialize our progress bar.
    my $received = 0;
    my $next_update = 0;   # reduce ProgressBar use.
    print "\nf: " . $received . "\n" if $debug;
    my $progress = Term::ProgressBar->new({count => $total_size, ETA => 'linear'});
    $progress->minor(0); # turns off the floating asterisks.
    $progress->max_update_rate(1); # only relevant when ETA is used.

    # Called by LWP for every chunk of data. This has to be an anonymous sub:
    # a named sub defined inside the loop stays bound to the variables of the
    # first iteration, which broke every download after the first one.
    my $on_chunk = sub {
        my ($data, $chunk_response, $protocol) = @_;
        print {$mov_fh} $data;
        $received += length($data);
        # reduce usage, as per example 3 in the Term::ProgressBar POD.
        $next_update = $progress->update($received)
            if $received >= $next_update;
    };
    my $dl_resp = $dl_ua->get($url, ':content_cb' => $on_chunk);
    # top off the progress bar.
    $progress->update($total_size);
    print "\nf: " . $received . "\n" if $debug;

    close($mov_fh) or warn "[err] Could not close $outfile: $!\n";

    # Do not leave a broken file behind, it would block the next attempt.
    if(!$dl_resp->is_success || $dl_resp->header('X-Died')) {
        warn "[err] Download of $url failed: " . $dl_resp->status_line . "\n";
        unlink($outfile);
    }
}