#!/usr/local/bin/perl # ================================================================== # Gossamer Links - enhanced directory management system # # Website : http://gossamer-threads.com/ # Support : http://gossamer-threads.com/scripts/support/ # CVS Info : 087,071,086,086,085 # Revision : $Id: nph-verify.cgi,v 1.45 2006/12/27 17:02:34 brewt Exp $ # # Copyright (c) 2001 Gossamer Threads Inc. All Rights Reserved. # Redistribution in part or in whole strictly prohibited. Please # see LICENSE file for full details. # ================================================================== use strict; use lib '/var/home/slowtwitch/slowtwitch.com/cgi-bin/articles/admin'; use vars qw/$USE_HTML $TODAY $GOOD $BAD @CACHE $MAX_ID/; use Links qw/$IN $DB $CFG/; $USE_HTML = exists $ENV{REQUEST_METHOD} ? 1 : 0; $TODAY = get_date(); $| = 1; local $SIG{__DIE__} = \&Links::fatal; Links::init('/var/home/slowtwitch/slowtwitch.com/cgi-bin/articles/admin'); Links::init_admin(); main(); sub main { # ------------------------------------------------------------------- # get the option parameters my $params = 0; my $id = 0; my ($method); if ($USE_HTML) { # If there is a command line argument while in USE_HTML mode that means that # this is a child. if (@ARGV == 2 or $ENV{verifier_lock_fpath}) { my $lock_fpath = $ENV{verifier_lock_fpath} || $ARGV[1]; # want the second parameter as the first should be --child delete $SIG{__DIE__}; child($lock_fpath); return; } # Otherwise, this is the parent print $IN->header(-nph => $CFG->{nph_headers}); $_ = $IN->param("do"); } else { $_ = join " ", @ARGV; # convert command line params to parseable commands if (/--check[_-]all/) { ($_, $method) = ("check_links", 5) } elsif (/--check[_-]from\s+(\d+)/) { ($_, $method, $params) = ("check_links", 1, $1) } elsif (/--check[_-]new/) { ($_, $method) = ("check_links", 4) } elsif (/--check[_-]problem/) { ($_, $method) = ("check_links", 3) } elsif (/--check[_-]status\s+(-?\d+)/) { ($_, $method, $params) = ("check_links", 6, $1) } elsif (/--check\s*(-?\d+)/) { ($_, $method, $id) = ("check_links", 7, $1) } elsif (/--fix[_-]302/) { $_ = "fix_302" } elsif (/--child\s*(.+)/) { child($1); return } # $1 should contain lock fpath else { $_ = undef } } # If no input just display the information screen if (not defined) { return $USE_HTML ? Links::admin_page('tools_verify.html') : command_line_help(); } $CFG->{verify_max_children} = $IN->param('verify_max_children') || $CFG->{verify_max_children} || 3; $CFG->{verify_chunk} = $IN->param('verify_chunk') || $CFG->{verify_chunk} || 10; $CFG->save; # Otherwise, try to fulfill the request if ($USE_HTML) { if (defined $IN->param("method")) { check_links( scalar $IN->param("method") || 5, scalar $IN->param("status") || 0, scalar $IN->param("ID") || 0, scalar $IN->param("days") || 0, scalar $IN->param("since"), scalar $IN->param("to") ); } else { return Links::admin_page('tools_verify.html'); } } else { check_links($method, $params, $id, $params); } } sub command_line_help { # ------------------------------------------------------------------- # Print out a usage summary. # print <<'HELP'; This script checks the Gossamer Links database for link integrity. The following parameters may be used from the command line: For checking links: (one of) --check-from number_of_days_ago --check-problem --check-new --check-status status_code --check-all For fixing links: --fix-302 HELP }; sub child { # ------------------------------------------------------------------- my $temp_fpath = shift; require Symbol; my $fh = Symbol->gensym; my $counter = 1; my $entry_size = 4; require Links::Tools; while (1) { while (-f "$temp_fpath.wait") { if ($counter++ == 50) { open $fh, "<$temp_fpath.wait" or warn "While trying to read '$temp_fpath.wait' got error: '$!'"; my $pid = int <$fh>; close $fh; if (kill 0, $pid) { warn "$pid has locked id file for long time. Will continue testing for another 50 interations then give up."; } else { warn "$pid has locked id file but it seems to have abnormally terminated. Removing .wait file and proceeding."; unlink "$temp_fpath.wait"; } } elsif ($counter > 100) { warn "$$ stopping run since pid file is locked for too long."; return; } select undef, undef, undef, 0.5; } my $old_sig = $SIG{INT}; $SIG{INT} = sub { unlink "$temp_fpath.wait"; if ($old_sig) { goto &$old_sig; } else { exit; } }; open $fh, "> $temp_fpath.wait"; print $fh $$; close $fh; my $index = 0; if (-f "$temp_fpath.ndx") { open $fh, "< $temp_fpath.ndx"; $index = int <$fh>; close $fh; } # offset into proper location in the file and get all the entries that need # testing my $file_index = $index * $entry_size; if ($file_index > -s $temp_fpath) { unlink "$temp_fpath.wait"; exit; } my $entry_buf = ''; open $fh, "< $temp_fpath"; sysseek $fh, $file_index, 0; sysread $fh, $entry_buf, $entry_size * $CFG->{verify_chunk}; close $fh; # now we can write the new index into the .ndx file for subsequence child # requests open $fh, "> $temp_fpath.ndx"; $index += $CFG->{verify_chunk}; print $fh $index; close $fh; # finally all actions have been completed here so we can release our lock on # the data file and let other children access the index unlink "$temp_fpath.wait"; $SIG{INT} = $old_sig || ''; # Need to convert all the ids that we fetched into usable workunits my @work_unit = unpack "l*", $entry_buf; # exit when there are no more work units unless (@work_unit) { warn "Child Done!"; return; } my $results = Links::Tools::check_links(@work_unit); commit_results($results); } } sub check_links { #---------------------------------------------------------------------- # generates the sql query that nabs the link subset that we want # to check then checks the links # my ($action, $status, $id, $days, $since, $to) = @_; # make sure everything is what we expect it to be $status = int $status; $id = int $id; $days = int $days; $since ||= ''; $since =~ m|(\d{4}/\d\d?/\d\d?)|; $since = $1; $to ||= ''; $to =~ m|(\d{4}/\d\d?/\d\d?)|; $to = $1; # build the query condition. require GT::SQL::Condition; my $cond = new GT::SQL::Condition; if ($action == 1) { # not checked in the last N days my $tmp_date = get_date(time - (86400 * $days)); $cond->add("Date_Checked", "<", $tmp_date); } elsif ($action == 2) { # checked last between N and O $cond->add("Date_Checked", ">", $since); $cond->add("Date_Checked", "<", $to); } elsif ($action == 3) { # problem links require Links::Tools; $cond->add(Status => [keys %Links::Tools::STATUS_BAD]); } elsif ($action == 4) { # new links $cond->add("Status", "=", 0); } elsif ($action == 5) { # everything $cond = {}; } elsif ($action == 6) { # check something based on status code $cond->add("Status", "=", $status); } elsif ($action == 7) { # check a certain link $cond->add("ID", "=", $id); } # find out how many items need to be checked..., my $link_db = $DB->table('Links'); my $count = $link_db->count($cond) || 0; if ($USE_HTML) { print <<' HTML'; Checking Links HTML print Links::header('Checking Links ...', 'Gossamer Links is now attempting to check your links, please be patient, this can take a while.', 0); print '
';
    }
    else {
        print "Checking $count links...\n\n";
    }

    my $start_time = time;

    unless ($count) {
        print "No links to check!\n\n";
    }
    else {

# Get all the links we're going to check.
# Done here so that we get the results before we call GT::TempFile to avoid any
# concurrency issues.
        my $link_sth = $DB->table('Links')->select(ID => $cond);

# Figure out where our new tempfile will live.
        require GT::TempFile;
        require Symbol;
        my $temp_file = GT::TempFile->new(destroy => 0);
        my $temp_fpath = $$temp_file;

# First, fetch all the links to be checked and place them into a temp file for
# fast retrieval by children.
        my $temp_fh = Symbol::gensym();
        my $entry_size = 4; # need this for lookups
        open $temp_fh, "> $temp_fpath";
        while (my $id = $link_sth->fetchrow) {
            print $temp_fh pack("l", $id);
        }
        close $temp_fh;

# Now, we can launch all the children that will start grabbing links

        require Links::Tools;
        require GT::IPC::Run;
        require GT::IPC::Filter::Line;

# Get links for the child to work upon. This functions by making sure the .wait
# file is not available or at least random time based checks until it is.  when
# it does finish, it will update the ndx file, which contains that index up to
# the last item that was accessed then remove the .wait file it had just
# created and continue with the checking of the fetched records.  Once the
# .wait file has been removed, other concurrently running children can then
# access the ndx
#
# We launch a new process instead of forking because...
#
#   Verifier seems to lock up after a failure that throws debug output.
#       Problematic especially under mod_perl
#   ODBC DBI handles are not shareable between threads without a clone.
#
# Ideally, the following line could have been used:
# $child_function = sub { child( $temp_fpath ) };
#
        my $child_function = "$CFG->{path_to_perl} $CFG->{admin_root_path}/nph-verify.cgi  --child $temp_fpath";

        my $line_function = sub {
        # --------------------------------------------------
            my $line = shift;
            my %values;
            my ($id, $status, $url) = split /\t/, $line;

            return unless $id and $status;
            $url ||= 'Missing URL';

            $USE_HTML ?
                print qq|Checked $id - $url - | :
                print "$id\t$url\t";

            if ($Links::Tools::STATUS_OK{$status}) {
                $GOOD++;
                print "Success ($status). Message: $Links::Tools::STATUS_OK{$status}";
            }
            else {
                $BAD++;
                print "Request Failed (" . ($status || "unresolvable") . ")";
                if ($status and $Links::Tools::STATUS_BAD{$status}) {
                    print " Message: $Links::Tools::STATUS_BAD{$status}";
                }
            }

            print "\n";
        };

        my $ofilter = GT::IPC::Filter::Line->new($line_function);

# Do the part that launches all the children.
        my $ipc = GT::IPC::Run->new;

        for (1 .. $CFG->{verify_max_children}) {
            $ipc->start(
                stdout  => $ofilter,
                program => $child_function
            ) or die $ipc->error;
            print scalar(localtime) . " New child started\n";
        }

        print "Finished launching children\n";

# Setup a signal handler on INT just in case we get an abnormal stop.
        local $SIG{INT} = sub {
            print "Unlinking temp files.\n";
            unlink $temp_fpath;
            unlink "$temp_fpath.wait";
            unlink "$temp_fpath.ndx";
            display_stats($start_time);
            exit;
        };

# Iterate until all the children have finished processing.
        $ipc->do_loop;
    }

    display_stats($start_time);

}

sub display_stats {
# --------------------------------------------------
# And can now print stats on how the checking went
# Triggered by an INT signal or on normal termination
# of the script.
#
    my $start_time = shift;
    my $end_time = time;
    my $run_time = $end_time - $start_time;

    {
        last unless $run_time;

        print "\n\n";
        print "Total Run Time: $run_time second(s)\n";

        $GOOD ||= 0;
        $BAD  ||= 0;
        my $total_links = $GOOD + $BAD or last;

        print "Total Links checked: $total_links\n";

        print "Total Links Bad: $BAD\n";
        print "Total Links Good: $GOOD\n\n";
        printf "Average time to check one link: %0.2fs\n", $run_time/$total_links;
        printf "Average links checked in a second: %0.2f\n", $total_links/$run_time;
    }

    print "
\n\n" if $USE_HTML; } sub commit_results { # -------------------------------------------------- # Used by a child. This takes a hashref keyed by LinkID # mapping to HTTP status and stores the results into # the local database # my $results = shift or return; my $link_db = $DB->table('Links'); my $ver_db = $DB->table('Verify'); for my $id (keys %$results) { my $status = $results->{$id}; next unless $status; my $t = localtime; $ver_db->add({ LinkID => $id, Status => $status, Date_Checked => $TODAY }) or warn "nph-verify.cgi: error adding to Verify table ($id): $GT::SQL::error"; $link_db->update({ Status => $status, Date_Checked => $TODAY }, { ID => $id }) or warn "nph-verify.cgi: error updating status ($id): $GT::SQL::error"; } } sub get_date { # -------------------------------------------------------- # Private method to translate a unix time value into a date. # my $time = shift || time; $time = time if $time =~ /\D/; my ($sec, $min, $hour, $day, $mon, $year) = localtime $time; return sprintf "%04d-%02d-%02d", $year + 1900, $mon + 1, $day; }