View file File name : DUCK.pm Content : # Copyright (C) 2016 Simon Kainz <skainz@debian.org> # # This program is free software; you can redistribute it and/or modify # it under the terms of the GNU General Public License as published by # he Free Software Foundation; either version 2 of the License, or # (at your option) any later version. # # This program is distributed in the hope that it will be useful, # but WITHOUT ANY WARRANTY; without even the implied warranty of # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the # GNU General Public License for more details. # # On Debian GNU/Linux systems, the complete text of the GNU General # Public License can be found in `/usr/share/common-licenses/GPL-2'. # # You should have received a copy of the GNU General Public License # along with this program. If not, you can find it on the World Wide # Web at https://www.gnu.org/copyleft/gpl.html, or write to the Free # Software Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, # MA 02110-1301, USA. use strict; use warnings; package DUCK; my $VERSION ='0.13'; my $COPYRIGHT_YEAR ='2017'; my $default_user_agent='Mozilla/5.0 (X11; Linux x86_64; rv:46.0) Gecko/20100101 Firefox/46.0'; use Regexp::Common qw /URI Email::Address/; use String::Similarity; use File::Which; use WWW::Curl::Easy; use strict; use IPC::Open3; use IO::Select; use Net::DNS; use Mail::Address; use Domain::PublicSuffix; use Config::Simple '-strict'; my $callbacks; my $urlfixes={ TRAILING_PAREN_DOT=>1, TRAILING_SLASH_DOT=>1, TRAILING_SLASH_PAREN=>1, TRAILING_COLON=>1, TRAILING_PUNCTUATION=>1, TRAILING_QUOTES=>1 }; my $self; my $helpers={ svn =>0, bzr =>0, git =>0, darcs =>1, # This works always as it uses WWW::Curl::Easy hg => 0, browser =>1 # This works always as we use WWW::Curl::Easy; }; my $website_moved_regexs; my $obsolete_sites; my $cli_options; my $tools= { git => { cmd => 'git', args => ['ls-remote','%URL%'] }, hg =>{ cmd => 'hg', args => ['id','%URL%'] }, bzr => { cmd => 'bzr', args => ['-Ossl.cert_reqs=none','log','%URL%'] }, svn => { cmd => 'svn', args => ['--non-interactive','--trust-server-cert','info','%URL%'] } }; sub version { return $VERSION; } sub copyright_year { return $COPYRIGHT_YEAR; } sub new { my $class = shift; $self = {}; bless $self, $class; $self->__find_helpers(); foreach (keys %$tools) { $tools->{$_}->{'args_count'}=scalar @{$tools->{$_}->{'args'}}; } my $config_file=find_config_file(); if($config_file) { my $nc=new Config::Simple($config_file); my $hashes=$nc->get_block("parked_domains"); $website_moved_regexs=$hashes->{'regexes'}; $hashes=$nc->get_block("obsolete_sites"); if ($hashes) { if ($hashes->{'regexes'}) { $obsolete_sites=$hashes->{'regexes'}; } } } return $self; } sub cb() { $callbacks= { "Vcs-Browser" =>\&browser, "Vcs-Darcs" =>\&darcs, "Vcs-Git" =>\&git, "Vcs-Hg" =>\&hg, "Vcs-Svn" =>\&svn, "Vcs-Bzr" =>\&bzr, "Homepage" => \&browser, "URL" => \&browser, "Email" => \&email, "Maintainer" => \&maintainer, "Uploaders" => \&uploaders, "Try-HTTPS" => \&try_https, "SVN" => \&svn }; return $callbacks; } sub setOptions() { shift; my ($ke,$va)=@_; $cli_options->{$ke}=$va; } sub getUrlFixes() { return keys %{$urlfixes}; } sub setUrlFixOptions() { shift; my ($ke,$va)=@_; if (!exists $urlfixes->{$ke}) { print "Unknown urlfix parameter: $ke\nAvailable options are:\n\t".join("\n\t",getUrlFixes())."\n"; exit 1; } $urlfixes->{$ke}=$va; } sub __find_helpers() { $helpers->{git}=1 unless !defined (which('git')); $helpers->{svn}=1 unless !defined (which('svn')); $helpers->{hg}=1 unless !defined (which('hg')); $helpers->{bzr}=1 unless !defined (which('bzr')); } sub getHelpers() { return $helpers; } sub git() { my ($url)=@_; my @urlparts=split(/\s+/,$url); if ($tools->{'git'}->{'args_count'}) { splice(@{$tools->{'git'}->{'args'}},$tools->{'git'}->{'args_count'}); } if ($urlparts[1]) { if ($urlparts[1] eq "-b" && $urlparts[2]) { push(@{$tools->{'git'}->{'args'}},'-b '.$urlparts[2]); } } return __run_helper('git',$urlparts[0]); } sub bzr() { my ($url)=@_; return __run_helper('bzr',$url); } sub hg() { my ($url)=@_; return __run_helper('hg',$url); } sub svn() { my ($url)=@_; $ENV{SVN_SSH}='ssh -o BatchMode=yes'; return __run_helper('svn',$url); } sub browser() { my $enforce=1; my ($url)=@_; $url =~ s/\.*$//g; { return __run_browser($url); } } sub try_https_new($;$) { my $similarity_th=0.9; my ($url,$erghttp)=@_; $url =~ s/\.*$//g; # print STDOUT "tryhttps: $url, ergshttp:".ref($erghttp)."\n"; my $res; #my $erghttp; # my $ergshttp= __run_browser($url,0); # print Dumper $ergshttp; # print "thttps:".ref($erghttp)."\n"; # if (scalar($ergshttp)) # { # $erghttp=@$ergshttp[0]; # if (@$erghttp[0]->{'retval'} >0) {return $erghttp;} # } # if ($erghttp->{'retval'} >0 ) {return $erghttp;} my $secure_url= $url; $secure_url=~ s/http:/https:/g; my $ergshttps= __run_browser($secure_url); my $erghttps=@$ergshttps[0]; if ($erghttps->{'retval'} >0 ) { # error with https, so do not suggest switching to https, report only http check results return $erghttp; } # otherwise check similarity, and report if pages are (quite) the same if ($erghttps && $erghttps->{'retval'} == 0 && $erghttp && $erghttp->{'body'} && $erghttp->{'finalscheme'} eq "http") { # https worked, now try to find out if pages match # print "#####################1\n".$erghttp->{'body'}."\n\n".$erghttps->{'body'}."\n\n"; my $similarity= similarity $erghttp->{'body'}, $erghttps->{'body'}; if ($similarity > $similarity_th) { $res->{'retval'}=2; $res->{'response'}="The web page at $url works, but is also available via $secure_url, please consider switching to HTTPS urls."; return $res; } } else { # report nothing $res->{'retval'}=0; return $res; } $res->{'retval'}=0; $res->{'response'}="lolz"; $res->{'url'}=$url; return $res; } sub try_https() { my $similarity_th=0.9; my ($url)=@_; $url =~ s/\.*$//g; my $res; my $erghttp= __run_browser($url); if ($erghttp->{'retval'} >0 ) {return $erghttp;} my $secure_url= $url; $secure_url=~ s/http:/https:/g; my $erghttps= __run_browser($secure_url); if ($erghttps->{'retval'} >0 ) { # error with https, so do not suggest switching to https, report only http check results return $erghttp; } # otherwise check similarity, and report if pages are (quite) the same if ($erghttps->{'retval'} == 0) { # https worked, now try to find out if pages match my $similarity= similarity $erghttp->{'body'}, $erghttps->{'body'}; if ($similarity > $similarity_th) { $res->{'retval'}=2; $res->{'response'}="The web page at $url works, but is also available via $secure_url, please consider switching to HTTPS urls."; return $res; } } else { # report nothing $res->{'retval'}=0; return $res; } $res->{'retval'}=0; $res->{'response'}="lolz"; $res->{'url'}=$url; return $res; } sub darcs() { my ($url)=@_; my $darcsurltemp=$url; $darcsurltemp =~ s/\/$//; $darcsurltemp.='/_darcs/hashed_inventory'; return __run_browser($darcsurltemp); } sub uploaders() { my ($line_uploaders)=@_; $line_uploaders =~ s/\n/ /g; my @emails; if ($line_uploaders =~ /@/) { @emails=Mail::Address->parse($line_uploaders); } my $res; foreach my $email(@emails) { my $es=$email->address(); my $ra=check_domain($es); my $r=@$ra[0]; if ($r->{retval}>0) { if (!$res->{retval}) { $res=$r; } else { $res->{retval}=$r->{retval}; $res->{response}.="\n".$r->{response}; $res->{url}="foo"; } } } if (!$res->{retval}) { $res->{'retval'}=0; $res->{'response'}=""; $res->{'url'}=$line_uploaders; } return $res; } sub maintainer() { my ($email)=@_; return check_domain($email); } sub email() { my ($email) =@_; return check_domain($email); } sub __run_browser { my $certainty; my @SSLs=(CURL_SSLVERSION_DEFAULT, CURL_SSLVERSION_TLSv1, CURL_SSLVERSION_SSLv2, CURL_SSLVERSION_SSLv3, CURL_SSLVERSION_TLSv1_0, CURL_SSLVERSION_TLSv1_1, CURL_SSLVERSION_TLSv1_2); my ($url,$return_ref)=@_; my $user_agent=$default_user_agent; if (! ( $cli_options->{'no-https'})) { $cli_options->{'no-https'}=0; } if (! ( $cli_options->{'no-check-certificate'})) { $cli_options->{'no-check-certificate'}=0; } #check if URL is mailto: link if ($url =~/mailto:\s*.+@.+/) { return check_domain($url); } my $curl = WWW::Curl::Easy->new; my @website_moved_whitelist=('anonscm.debian.org.*duck.git'); $curl->setopt(CURLOPT_HEADER,0); if ($cli_options->{'no-check-certificate'} eq 1) { $curl->setopt(CURLOPT_SSL_VERIFYPEER,0); $curl->setopt(CURLOPT_SSL_VERIFYHOST,0); } $curl->setopt(CURLOPT_CERTINFO,0); $curl->setopt(CURLOPT_FOLLOWLOCATION,1); $curl->setopt(CURLOPT_SSL_CIPHER_LIST,'ALL'); $curl->setopt(CURLOPT_MAXREDIRS,10); $curl->setopt(CURLOPT_TIMEOUT,60); $curl->setopt(CURLOPT_USERAGENT,$user_agent); $curl->setopt(CURLOPT_URL, $url); #validate against ca-certificates # $curl->setopt(CURLOPT_SSL_VERIFYPEER,1); # $curl->setopt(CURLOPT_SSL_VERIFYHOST,1); # $curl->setopt(CURLOPT_CAPATH,''); my $response_body; my $response_code; my $retcode; my $response; my $startscheme; my $finalscheme; my $startdomain; my $finaldomain; my $startdomainsuffix; my $finaldomainsuffix; if ($url =~ /($RE{URI}{HTTP}{-keep}{-scheme =>'https?'})/ ) { $startdomain=$4; $startscheme=$3; } foreach my $s (@SSLs) { $curl->setopt(CURLOPT_WRITEDATA,\$response_body); $curl->setopt(CURLOPT_SSLVERSION,$s); # Starts the actual request $retcode = $curl->perform; $response_code = $curl->getinfo(CURLINFO_HTTP_CODE); $response=$curl->strerror($retcode)." ".$curl->errbuf."\n"; if ($curl->getinfo(CURLINFO_EFFECTIVE_URL) =~ /($RE{URI}{HTTP}{-keep}{-scheme =>'https?'})/ ) { $finaldomain=$4; $finalscheme=$3; } $startdomainsuffix = Domain::PublicSuffix->new({'data_file' => '/usr/share/publicsuffix/effective_tld_names.dat'}); $finaldomainsuffix = Domain::PublicSuffix->new({'data_file' => '/usr/share/publicsuffix/effective_tld_names.dat'}); if ($retcode == 35) { next;} if ($retcode == 56) {next;} last; } # Looking at the results... my $status=0; my $disp=0; my $is_obsolete=0; if ($retcode == 0) # no curl error, but maybe a http error { #default to error $status=1; $disp=1; #handle ok cases, 200 is ok for sure if ($response_code ==200 ) { $status=0; $disp=0; } if ($response_code ==226 ) { $status=0; $disp=0; } if ($response_code ==227 ) { $status=0; $disp=0; } if ($response_code ==302 ) #temporary redirect is ok { $status=0; $disp=0; } if ($response_code ==403) { ## special case for sourceforge.net sites ## sourceforge seems to always return correct pages wit http code 40. if ( $url =~ m/(sourceforge|sf).net/i) { $status=0; $disp=0; } } my $whitelisted=0; foreach my $whitelist_url (@website_moved_whitelist) { if ( $url =~ m/$whitelist_url/i) {$whitelisted=1;} } foreach my $obsolete_site (@$obsolete_sites) { if ($url =~ m/$obsolete_site/i) { $is_obsolete=1; } } if ($whitelisted == 0 && $response_body) { $response_body=strip_html_comments($response_body); foreach my $regex (@$website_moved_regexs) { if ($response_body =~ m/$regex/i ) { $disp=2; $response.="Website seems to be outdated, is probably a parked domain or for sale. Please update your links!\nMatching regular expression: m/".$regex."/i"; $certainty="wild-guess"; last; } } } } else { # we have a curl error, so we show this entry for sure $status=1; $disp=1; } my $ret; $ret->{'retval'}=$disp; $ret->{'response'}="Curl:$retcode HTTP:$response_code $response"; $ret->{'url'}=$url; $ret->{'body'}=$response_body; $ret->{'certainty'}=$certainty; $ret->{'startscheme'}=$startscheme; $ret->{'startdomain'}=$startdomain; $ret->{'finalscheme'}=$finalscheme; $ret->{'finaldomain'}=$finaldomain; my @reta; push(@reta,$ret); if ($startscheme && $finalscheme) { my $rd_startdomainsuffix=$startdomainsuffix->get_root_domain($startdomain); my $rd_finaldomainsuffix=$startdomainsuffix->get_root_domain($finaldomain); if ($rd_startdomainsuffix && $rd_finaldomainsuffix && ($rd_startdomainsuffix ne $rd_finaldomainsuffix)) { my $ret_dom; $ret_dom->{'retval'}=2; $ret_dom->{'response'}="Domain redirect detected: ${startscheme}://".$startdomain." -> ${finalscheme}://".$finaldomain.". Probably a new upstream website?"; $ret_dom->{'url'}=$url; $ret_dom->{'certainty'}=$certainty; $ret_dom->{'startscheme'}=$startscheme; $ret_dom->{'startdomain'}=$startdomain; $ret_dom->{'finalscheme'}=$finalscheme; $ret_dom->{'finaldomain'}=$finaldomain; push (@reta,$ret_dom); } else { if ($is_obsolete==1) { my $ret_obsolete; $ret_obsolete->{'retval'}=2; $ret_obsolete->{'response'}="The website/URL is known to be obsolete. Please update your links."; $ret_obsolete->{'url'}=$url; $ret_obsolete->{'certainty'}="wild-guess"; push (@reta,$ret_obsolete); } if ($startscheme eq "https" and $finalscheme eq "http") { my $ret_schema; $ret_schema->{'retval'}=2; $ret_schema->{'response'}="Secure URL redirects to an insecure URL: ${startscheme}://$startdomain -> ${finalscheme}://${finaldomain}"; $ret_schema->{'url'}=$url; # $ret->{'body'}=$; $ret_schema->{'certainty'}=$certainty; $ret_schema->{'startscheme'}=$startscheme; $ret_schema->{'startdomain'}=$startdomain; $ret_schema->{'finalscheme'}=$finalscheme; $ret_schema->{'finaldomain'}=$finaldomain; push (@reta,$ret_schema); } if ($startscheme eq "http" and $finalscheme eq "https") { my $ret_schema; $ret_schema->{'retval'}=2; $ret_schema->{'response'}="URL schema changed from HTTP to HTTPS during redirect(s): ${startscheme}://$startdomain -> ${finalscheme}://${finaldomain}\nPlease investigate and update the URL eventually, to avoid unneccesary redirects!"; $ret_schema->{'url'}=$url; $ret_schema->{'certainty'}=$certainty; $ret_schema->{'startscheme'}=$startscheme; $ret_schema->{'startdomain'}=$startdomain; $ret_schema->{'finalscheme'}=$finalscheme; $ret_schema->{'finaldomain'}=$finaldomain; push (@reta,$ret_schema); } } } if ( ($ret->{'retval'} ==0) && defined($startscheme) && (!($startscheme eq "https"))) { if ( ($cli_options->{'no-https'}==0) && (!($url =~ m/https:\/\//i )) ) { my $https=try_https_new($url,$ret); if ($https->{'retval'} > 0 ) { push(@reta,$https); } } } return \@reta; } sub find_config_file() { (my $conffilename)=@_; if (!defined $conffilename) { $conffilename="duck/duck.conf"; } my @config_dirs; if (!$ENV{'XDG_CONFIG_DIRS'}) { push(@config_dirs,"/etc/xdg"); } else { push(@config_dirs,split(/:/,$ENV{'XDG_CONFIG_DIRS'})); } push (@config_dirs,"/etc"); if (!$ENV{'XDG_CONFIG_HOME'}) { push(@config_dirs,$ENV{'HOME'}."/.config"); } else { push(@config_dirs,$ENV{'XDG_CONFIG_HOME'}); } foreach my $cdir (reverse @config_dirs) { my $fp=$cdir.'/'.$conffilename; if ( -r $fp) {return $fp;} } return 0; } sub __run_helper { my ($tool,$url)=@_; return undef unless $helpers->{$tool} == 1; return undef unless defined $tools->{$tool}; my @args=@{$tools->{$tool}->{'args'}}; for(@args){s/\%URL\%/$url/g} my $pid; my $command; my $timeout; my @reta; if ($cli_options->{'timeout'}) { my $timeout_value=60; if ( ( $cli_options->{'timeout_seconds'} )) { $timeout_value=$cli_options->{'timeout_seconds'}; $timeout_value =~ s/[^0-9]//; } unshift @args,$tools->{$tool}->{'cmd'}; unshift @args,$timeout_value."s"; $command="/usr/bin/timeout"; $pid=open3(\*WRITE,\*READ,0,$command,@args); } else { $pid=open3(\*WRITE,\*READ,0,$tools->{$tool}->{'cmd'},@args); } my @results = <READ>; waitpid ($pid,0); close READ; my $retval=$?; my $ret; $ret->{'retval'}=$retval; $ret->{'response'}=join("",@results); $ret->{'url'}=$url; push(@reta,$ret); return \@reta; } sub check_domain($) { my $res = Net::DNS::Resolver->new; my ($email) = @_; my @emails=Mail::Address->parse($email); $email=$emails[0]->address(); # $email=$email->address(); my @domain = ( $email =~ m/^[^@]*@([^?^&^>]*).*/); my @queries=('MX','A','AAAA'); my @results; my $iserror=1; foreach my $query (@queries) { my $q=$res->query($domain[0],$query); if ($q) { my @answers=$q->answer; my $mxcount=scalar @answers; push (@results,$mxcount." ".$query." entries found."); $iserror=0; last; } else { push (@results,"$email: No ".$query." entry found."); } } my $ret; my @reta; $ret->{'retval'}=$iserror; $ret->{'response'}=join("\n",@results); $ret->{'url'}=$email; push(@reta,$ret); return \@reta; } sub strip_html_comments() { my ($html)=@_; my $pid=open3(\*WRITE,\*READ,\*ERR,'lynx -cfg=/dev/null -dump -width=2048 -stdin') or die("lynx not found!"); print WRITE $html; close WRITE; my @cleaned=<READ>; waitpid($pid,0); close(READ); return join("",@cleaned); } sub extract_url($) { my $url; my ($b,$l)=@_; if ( $l =~ /\(\s*($RE{URI}{-keep}{-scheme =>'https?'})\s*\)/ || $l =~ /($RE{URI}{HTTP}{-keep}{-scheme =>'https?'})/ || $l =~ /\(\s*($RE{URI}{HTTP}{-keep}{-scheme =>'ftp'})\s*\)/ || $l =~ /($RE{URI}{HTTP}{-keep}{-scheme =>'ftp'})/ ) { #ok, we have a url here, now clean it up: my $url=$1; if ($urlfixes->{TRAILING_PAREN_DOT}) { $url =~ s/\)\.\s*$//g; } if ($urlfixes->{TRAILING_SLASH_DOT}) { $url =~ s/\/\.\s*$/\//g; } if ($urlfixes->{TRAILING_SLASH_PAREN}) { $url =~ s/\/\)\s*$/\//g; } if ($urlfixes->{TRAILING_COLON}) { $url =~ s/:\s*$//g; } if ($urlfixes->{TRAILING_PUNCTUATION}) { $url =~ s/([^\/])[\.,]\s*$/$1/g; } if ($urlfixes->{TRAILING_QUOTES}) { $url =~ s/'\s*$//g; } return $url; } }; 1;