471 lines
		
	
	
		
			14 KiB
		
	
	
	
		
			Perl
		
	
	
		
			Executable File
		
	
	
	
	
			
		
		
	
	
			471 lines
		
	
	
		
			14 KiB
		
	
	
	
		
			Perl
		
	
	
		
			Executable File
		
	
	
	
	
#!/usr/local/bin/perl
 | 
						|
# ==================================================================
 | 
						|
# Gossamer Links - enhanced directory management system
 | 
						|
#
 | 
						|
#   Website  : http://gossamer-threads.com/
 | 
						|
#   Support  : http://gossamer-threads.com/scripts/support/
 | 
						|
#   CVS Info : 087,071,086,086,085      
 | 
						|
#   Revision : $Id: nph-verify.cgi,v 1.45 2006/12/27 17:02:34 brewt Exp $
 | 
						|
# 
 | 
						|
# Copyright (c) 2001 Gossamer Threads Inc.  All Rights Reserved.
 | 
						|
# Redistribution in part or in whole strictly prohibited. Please
 | 
						|
# see LICENSE file for full details.
 | 
						|
# ==================================================================
 | 
						|
 | 
						|
use strict;
 | 
						|
use lib '/var/home/slowtwitch/slowtwitch.com/cgi-bin/articles/admin';
 | 
						|
use vars qw/$USE_HTML $TODAY $GOOD $BAD @CACHE $MAX_ID/;
 | 
						|
use Links qw/$IN $DB $CFG/;
 | 
						|
 | 
						|
$USE_HTML = exists $ENV{REQUEST_METHOD} ? 1 : 0;
 | 
						|
 | 
						|
$TODAY = get_date();
 | 
						|
 | 
						|
$| = 1;
 | 
						|
local $SIG{__DIE__} = \&Links::fatal;
 | 
						|
Links::init('/var/home/slowtwitch/slowtwitch.com/cgi-bin/articles/admin');
 | 
						|
Links::init_admin();
 | 
						|
 | 
						|
 | 
						|
main();
 | 
						|
 | 
						|
sub main {
 | 
						|
# -------------------------------------------------------------------
 | 
						|
 | 
						|
# get the option parameters
 | 
						|
    my $params  = 0;
 | 
						|
    my $id      = 0;
 | 
						|
    my ($method);
 | 
						|
    if ($USE_HTML) {
 | 
						|
 | 
						|
# If there is a command line argument while in USE_HTML mode that means that
 | 
						|
# this is a child.
 | 
						|
        if (@ARGV == 2 or $ENV{verifier_lock_fpath}) {
 | 
						|
            my $lock_fpath = $ENV{verifier_lock_fpath}
 | 
						|
                || $ARGV[1]; # want the second parameter as the first should be --child
 | 
						|
            delete $SIG{__DIE__};
 | 
						|
            child($lock_fpath);
 | 
						|
            return;
 | 
						|
        }
 | 
						|
 | 
						|
# Otherwise, this is the parent
 | 
						|
        print $IN->header(-nph => $CFG->{nph_headers});
 | 
						|
        $_ = $IN->param("do");
 | 
						|
    }
 | 
						|
    else {
 | 
						|
        $_ = join " ", @ARGV;
 | 
						|
# convert command line params to parseable commands
 | 
						|
        if    (/--check[_-]all/)              { ($_, $method) = ("check_links", 5) }
 | 
						|
        elsif (/--check[_-]from\s+(\d+)/)     { ($_, $method, $params) = ("check_links", 1, $1) }
 | 
						|
        elsif (/--check[_-]new/)              { ($_, $method) = ("check_links", 4) }
 | 
						|
        elsif (/--check[_-]problem/)          { ($_, $method) = ("check_links", 3) }
 | 
						|
        elsif (/--check[_-]status\s+(-?\d+)/) { ($_, $method, $params) = ("check_links", 6, $1) }
 | 
						|
        elsif (/--check\s*(-?\d+)/)           { ($_, $method, $id) = ("check_links", 7, $1) }
 | 
						|
        elsif (/--fix[_-]302/)                { $_ = "fix_302" }
 | 
						|
        elsif (/--child\s*(.+)/)              { child($1); return } # $1 should contain lock fpath
 | 
						|
        else                                  { $_ = undef }
 | 
						|
    }
 | 
						|
 | 
						|
# If no input just display the information screen
 | 
						|
    if (not defined) {
 | 
						|
        return $USE_HTML ? Links::admin_page('tools_verify.html') : command_line_help();
 | 
						|
    }
 | 
						|
 | 
						|
    $CFG->{verify_max_children} = $IN->param('verify_max_children') || $CFG->{verify_max_children} || 3;
 | 
						|
    $CFG->{verify_chunk} = $IN->param('verify_chunk') || $CFG->{verify_chunk} || 10;
 | 
						|
    $CFG->save;
 | 
						|
 | 
						|
# Otherwise, try to fulfill the request
 | 
						|
    if ($USE_HTML) {
 | 
						|
        if (defined $IN->param("method")) {
 | 
						|
            check_links(
 | 
						|
                scalar $IN->param("method") || 5,
 | 
						|
                scalar $IN->param("status") || 0,
 | 
						|
                scalar $IN->param("ID")     || 0,
 | 
						|
                scalar $IN->param("days")   || 0,
 | 
						|
                scalar $IN->param("since"),
 | 
						|
                scalar $IN->param("to")
 | 
						|
            );
 | 
						|
        }
 | 
						|
        else {
 | 
						|
            return Links::admin_page('tools_verify.html');
 | 
						|
        }
 | 
						|
    }
 | 
						|
    else {
 | 
						|
        check_links($method, $params, $id, $params);
 | 
						|
    }
 | 
						|
 | 
						|
}
 | 
						|
 | 
						|
sub command_line_help {
 | 
						|
# -------------------------------------------------------------------
 | 
						|
# Print out a usage summary.
 | 
						|
#
 | 
						|
    print <<'HELP';
 | 
						|
This script checks the Gossamer Links database for link integrity.
 | 
						|
 | 
						|
The following parameters may be used from the command line:
 | 
						|
 | 
						|
For checking links: (one of)
 | 
						|
        --check-from number_of_days_ago
 | 
						|
        --check-problem
 | 
						|
        --check-new
 | 
						|
        --check-status status_code
 | 
						|
        --check-all
 | 
						|
 | 
						|
For fixing links:
 | 
						|
        --fix-302
 | 
						|
 | 
						|
HELP
 | 
						|
};
 | 
						|
 | 
						|
sub child {
 | 
						|
# -------------------------------------------------------------------
 | 
						|
    my $temp_fpath = shift;
 | 
						|
 | 
						|
    require Symbol;
 | 
						|
    my $fh = Symbol->gensym;
 | 
						|
    my $counter = 1;
 | 
						|
    my $entry_size = 4;
 | 
						|
 | 
						|
    require Links::Tools;
 | 
						|
 | 
						|
    while (1) {
 | 
						|
        while (-f "$temp_fpath.wait") {
 | 
						|
            if ($counter++ == 50) {
 | 
						|
                open $fh, "<$temp_fpath.wait" or warn "While trying to read '$temp_fpath.wait' got error: '$!'";
 | 
						|
                my $pid = int <$fh>;
 | 
						|
                close $fh;
 | 
						|
 | 
						|
                if (kill 0, $pid) {
 | 
						|
                    warn "$pid has locked id file for long time. Will continue testing for another 50 interations then give up.";
 | 
						|
                }
 | 
						|
                else {
 | 
						|
                    warn "$pid has locked id file but it seems to have abnormally terminated. Removing .wait file and proceeding.";
 | 
						|
                    unlink "$temp_fpath.wait";
 | 
						|
                }
 | 
						|
            }
 | 
						|
            elsif ($counter > 100) {
 | 
						|
                warn "$$ stopping run since pid file is locked for too long.";
 | 
						|
                return;
 | 
						|
            }
 | 
						|
            select undef, undef, undef, 0.5;
 | 
						|
        }
 | 
						|
 | 
						|
        my $old_sig = $SIG{INT};
 | 
						|
        $SIG{INT} = sub {
 | 
						|
            unlink "$temp_fpath.wait";
 | 
						|
            if ($old_sig) {
 | 
						|
                goto &$old_sig;
 | 
						|
            }
 | 
						|
            else {
 | 
						|
                exit;
 | 
						|
            }
 | 
						|
        };
 | 
						|
 | 
						|
        open $fh, "> $temp_fpath.wait";
 | 
						|
        print $fh $$;
 | 
						|
        close $fh;
 | 
						|
 | 
						|
        my $index = 0;
 | 
						|
        if (-f "$temp_fpath.ndx") {
 | 
						|
            open $fh, "< $temp_fpath.ndx";
 | 
						|
            $index = int <$fh>;
 | 
						|
            close $fh;
 | 
						|
        }
 | 
						|
 | 
						|
# offset into proper location in the file and get all the entries that need
 | 
						|
# testing
 | 
						|
        my $file_index = $index * $entry_size;
 | 
						|
 | 
						|
        if ($file_index > -s $temp_fpath) {
 | 
						|
            unlink "$temp_fpath.wait";
 | 
						|
            exit;
 | 
						|
        }
 | 
						|
 | 
						|
        my $entry_buf = '';
 | 
						|
 | 
						|
        open $fh, "< $temp_fpath";
 | 
						|
        sysseek $fh, $file_index, 0;
 | 
						|
        sysread $fh, $entry_buf, $entry_size * $CFG->{verify_chunk};
 | 
						|
        close $fh;
 | 
						|
 | 
						|
# now we can write the new index into the .ndx file for subsequence child
 | 
						|
# requests
 | 
						|
        open $fh, "> $temp_fpath.ndx";
 | 
						|
        $index += $CFG->{verify_chunk};
 | 
						|
        print $fh $index;
 | 
						|
        close $fh;
 | 
						|
 | 
						|
# finally all actions have been completed here so we can release our lock on
 | 
						|
# the data file and let other children access the index
 | 
						|
        unlink "$temp_fpath.wait";
 | 
						|
        $SIG{INT} = $old_sig || '';
 | 
						|
 | 
						|
# Need to convert all the ids that we fetched into usable workunits
 | 
						|
        my @work_unit = unpack "l*", $entry_buf;
 | 
						|
 | 
						|
# exit when there are no more work units 
 | 
						|
        unless (@work_unit) {
 | 
						|
            warn "Child Done!";
 | 
						|
            return;
 | 
						|
        }
 | 
						|
 | 
						|
        my $results = Links::Tools::check_links(@work_unit);
 | 
						|
        commit_results($results);
 | 
						|
    }
 | 
						|
}
 | 
						|
 | 
						|
sub check_links {
 | 
						|
#----------------------------------------------------------------------
 | 
						|
# generates the sql query that nabs the link subset that we want
 | 
						|
# to check then checks the links
 | 
						|
#
 | 
						|
    my ($action, $status, $id, $days, $since, $to) = @_;
 | 
						|
 | 
						|
# make sure everything is what we expect it to be
 | 
						|
    $status = int $status;
 | 
						|
    $id     = int $id;
 | 
						|
    $days   = int $days;
 | 
						|
    $since ||= '';
 | 
						|
    $since   =~ m|(\d{4}/\d\d?/\d\d?)|;
 | 
						|
    $since   = $1;
 | 
						|
    $to    ||= '';
 | 
						|
    $to      =~ m|(\d{4}/\d\d?/\d\d?)|;
 | 
						|
    $to      = $1;
 | 
						|
 | 
						|
# build the query condition.
 | 
						|
    require GT::SQL::Condition;
 | 
						|
    my $cond = new GT::SQL::Condition;
 | 
						|
    if ($action == 1) {
 | 
						|
        # not checked in the last N days
 | 
						|
        my $tmp_date = get_date(time - (86400 * $days));
 | 
						|
        $cond->add("Date_Checked", "<", $tmp_date);
 | 
						|
    }
 | 
						|
    elsif ($action == 2) {
 | 
						|
        # checked last between N and O
 | 
						|
        $cond->add("Date_Checked", ">", $since);
 | 
						|
        $cond->add("Date_Checked", "<", $to);
 | 
						|
    }
 | 
						|
    elsif ($action == 3) {
 | 
						|
        # problem links
 | 
						|
        require Links::Tools;
 | 
						|
        $cond->add(Status => [keys %Links::Tools::STATUS_BAD]);
 | 
						|
    }
 | 
						|
    elsif ($action == 4) {
 | 
						|
        # new links
 | 
						|
        $cond->add("Status", "=", 0);
 | 
						|
    }
 | 
						|
    elsif ($action == 5) {
 | 
						|
        # everything
 | 
						|
        $cond = {};
 | 
						|
    }
 | 
						|
    elsif ($action == 6) {
 | 
						|
        # check something based on status code
 | 
						|
        $cond->add("Status", "=", $status);
 | 
						|
    }
 | 
						|
    elsif ($action == 7) {
 | 
						|
        # check a certain link
 | 
						|
        $cond->add("ID", "=", $id);
 | 
						|
    }
 | 
						|
 | 
						|
# find out how many items need to be checked...,
 | 
						|
    my $link_db = $DB->table('Links');
 | 
						|
    my $count = $link_db->count($cond) || 0;
 | 
						|
 | 
						|
    if ($USE_HTML) {
 | 
						|
        print <<'        HTML';
 | 
						|
<html>
 | 
						|
<head>
 | 
						|
<title>Checking Links</title>
 | 
						|
<body bgcolor="white">
 | 
						|
        HTML
 | 
						|
        print Links::header('Checking Links ...', 'Gossamer Links is now attempting to check your links, please be patient, this can take a while.', 0);
 | 
						|
        print '<pre>';
 | 
						|
    }
 | 
						|
    else {
 | 
						|
        print "Checking $count links...\n\n";
 | 
						|
    }
 | 
						|
 | 
						|
    my $start_time = time;
 | 
						|
 | 
						|
    unless ($count) {
 | 
						|
        print "No links to check!\n\n";
 | 
						|
    }
 | 
						|
    else {
 | 
						|
 | 
						|
# Get all the links we're going to check.
 | 
						|
# Done here so that we get the results before we call GT::TempFile to avoid any
 | 
						|
# concurrency issues.
 | 
						|
        my $link_sth = $DB->table('Links')->select(ID => $cond);
 | 
						|
 | 
						|
# Figure out where our new tempfile will live.
 | 
						|
        require GT::TempFile;
 | 
						|
        require Symbol;
 | 
						|
        my $temp_file = GT::TempFile->new(destroy => 0);
 | 
						|
        my $temp_fpath = $$temp_file;
 | 
						|
 | 
						|
# First, fetch all the links to be checked and place them into a temp file for
 | 
						|
# fast retrieval by children.
 | 
						|
        my $temp_fh = Symbol::gensym();
 | 
						|
        my $entry_size = 4; # need this for lookups
 | 
						|
        open $temp_fh, "> $temp_fpath";
 | 
						|
        while (my $id = $link_sth->fetchrow) {
 | 
						|
            print $temp_fh pack("l", $id);
 | 
						|
        }
 | 
						|
        close $temp_fh;
 | 
						|
 | 
						|
# Now, we can launch all the children that will start grabbing links
 | 
						|
 | 
						|
        require Links::Tools;
 | 
						|
        require GT::IPC::Run;
 | 
						|
        require GT::IPC::Filter::Line;
 | 
						|
 | 
						|
# Get links for the child to work upon. This functions by making sure the .wait
 | 
						|
# file is not available or at least random time based checks until it is.  when
 | 
						|
# it does finish, it will update the ndx file, which contains that index up to
 | 
						|
# the last item that was accessed then remove the .wait file it had just
 | 
						|
# created and continue with the checking of the fetched records.  Once the
 | 
						|
# .wait file has been removed, other concurrently running children can then
 | 
						|
# access the ndx
 | 
						|
#
 | 
						|
# We launch a new process instead of forking because...
 | 
						|
#
 | 
						|
#   Verifier seems to lock up after a failure that throws debug output.
 | 
						|
#       Problematic especially under mod_perl
 | 
						|
#   ODBC DBI handles are not shareable between threads without a clone.
 | 
						|
#
 | 
						|
# Ideally, the following line could have been used:
 | 
						|
# $child_function = sub { child( $temp_fpath ) };
 | 
						|
#
 | 
						|
        my $child_function = "$CFG->{path_to_perl} $CFG->{admin_root_path}/nph-verify.cgi  --child $temp_fpath";
 | 
						|
 | 
						|
        my $line_function = sub {
 | 
						|
        # --------------------------------------------------
 | 
						|
            my $line = shift;
 | 
						|
            my %values;
 | 
						|
            my ($id, $status, $url) = split /\t/, $line;
 | 
						|
 | 
						|
            return unless $id and $status;
 | 
						|
            $url ||= 'Missing URL';
 | 
						|
 | 
						|
            $USE_HTML ?
 | 
						|
                print qq|Checked <a href="$url" target='_blank'>$id</a> - $url - | :
 | 
						|
                print "$id\t$url\t";
 | 
						|
 | 
						|
            if ($Links::Tools::STATUS_OK{$status}) {
 | 
						|
                $GOOD++;
 | 
						|
                print "Success ($status). Message: $Links::Tools::STATUS_OK{$status}";
 | 
						|
            }
 | 
						|
            else {
 | 
						|
                $BAD++;
 | 
						|
                print "Request Failed (" . ($status || "unresolvable") . ")";
 | 
						|
                if ($status and $Links::Tools::STATUS_BAD{$status}) {
 | 
						|
                    print " Message: $Links::Tools::STATUS_BAD{$status}";
 | 
						|
                }
 | 
						|
            }
 | 
						|
 | 
						|
            print "\n";
 | 
						|
        };
 | 
						|
 | 
						|
        my $ofilter = GT::IPC::Filter::Line->new($line_function);
 | 
						|
 | 
						|
# Do the part that launches all the children.
 | 
						|
        my $ipc = GT::IPC::Run->new;
 | 
						|
 | 
						|
        for (1 .. $CFG->{verify_max_children}) {
 | 
						|
            $ipc->start(
 | 
						|
                stdout  => $ofilter,
 | 
						|
                program => $child_function
 | 
						|
            ) or die $ipc->error;
 | 
						|
            print scalar(localtime) . " New child started\n";
 | 
						|
        }
 | 
						|
 | 
						|
        print "Finished launching children\n";
 | 
						|
 | 
						|
# Setup a signal handler on INT just in case we get an abnormal stop.
 | 
						|
        local $SIG{INT} = sub {
 | 
						|
            print "Unlinking temp files.\n";
 | 
						|
            unlink $temp_fpath;
 | 
						|
            unlink "$temp_fpath.wait";
 | 
						|
            unlink "$temp_fpath.ndx";
 | 
						|
            display_stats($start_time);
 | 
						|
            exit;
 | 
						|
        };
 | 
						|
 | 
						|
# Iterate until all the children have finished processing.
 | 
						|
        $ipc->do_loop;
 | 
						|
    }
 | 
						|
 | 
						|
    display_stats($start_time);
 | 
						|
 | 
						|
}
 | 
						|
 | 
						|
sub display_stats {
 | 
						|
# --------------------------------------------------
 | 
						|
# And can now print stats on how the checking went
 | 
						|
# Triggered by an INT signal or on normal termination
 | 
						|
# of the script.
 | 
						|
#
 | 
						|
    my $start_time = shift;
 | 
						|
    my $end_time = time;
 | 
						|
    my $run_time = $end_time - $start_time;
 | 
						|
 | 
						|
    {
 | 
						|
        last unless $run_time;
 | 
						|
 | 
						|
        print "\n\n";
 | 
						|
        print "Total Run Time: $run_time second(s)\n";
 | 
						|
 | 
						|
        $GOOD ||= 0;
 | 
						|
        $BAD  ||= 0;
 | 
						|
        my $total_links = $GOOD + $BAD or last;
 | 
						|
 | 
						|
        print "Total Links checked: $total_links\n";
 | 
						|
 | 
						|
        print "Total Links Bad: $BAD\n";
 | 
						|
        print "Total Links Good: $GOOD\n\n";
 | 
						|
        printf "Average time to check one link: %0.2fs\n", $run_time/$total_links;
 | 
						|
        printf "Average links checked in a second: %0.2f\n", $total_links/$run_time;
 | 
						|
    }
 | 
						|
 | 
						|
    print "</pre></body></html>\n\n" if $USE_HTML;
 | 
						|
}
 | 
						|
 | 
						|
sub commit_results {
 | 
						|
# --------------------------------------------------
 | 
						|
# Used by a child. This takes a hashref keyed by LinkID
 | 
						|
# mapping to HTTP status and stores the results into
 | 
						|
# the local database
 | 
						|
#
 | 
						|
    my $results = shift or return;
 | 
						|
 | 
						|
    my $link_db = $DB->table('Links');
 | 
						|
    my $ver_db = $DB->table('Verify');
 | 
						|
 | 
						|
    for my $id (keys %$results) {
 | 
						|
        my $status = $results->{$id};
 | 
						|
        next unless $status;
 | 
						|
        my $t = localtime;
 | 
						|
 | 
						|
        $ver_db->add({
 | 
						|
            LinkID       => $id,
 | 
						|
            Status       => $status,
 | 
						|
            Date_Checked => $TODAY
 | 
						|
        }) or warn "nph-verify.cgi: error adding to Verify table ($id): $GT::SQL::error";
 | 
						|
 | 
						|
        $link_db->update({ Status => $status, Date_Checked => $TODAY }, { ID => $id }) or
 | 
						|
            warn "nph-verify.cgi: error updating status ($id): $GT::SQL::error";
 | 
						|
    }
 | 
						|
}
 | 
						|
 | 
						|
sub get_date {
 | 
						|
# --------------------------------------------------------
 | 
						|
# Private method to translate a unix time value into a date.
 | 
						|
#
 | 
						|
    my $time = shift || time;
 | 
						|
    $time = time if $time =~ /\D/;
 | 
						|
    my ($sec, $min, $hour, $day, $mon, $year) = localtime $time;
 | 
						|
    return sprintf "%04d-%02d-%02d", $year + 1900, $mon + 1, $day;
 | 
						|
}
 |