discourse-legacysite-perl/site/slowtwitch.com/cgi-bin/articles/admin/Links/Import/RDFS2.pm
2024-06-17 21:49:12 +10:00

534 lines
26 KiB
Perl

# ==================================================================
# Gossamer Links - enhanced directory management system
#
# Website : http://gossamer-threads.com/
# Support : http://gossamer-threads.com/scripts/support/
# CVS Info : 087,071,086,086,085
# Revision : $Id: RDFS2.pm,v 1.20 2005/04/07 19:34:41 brewt Exp $
#
# Copyright (c) 2001 Gossamer Threads Inc. All Rights Reserved.
# Redistribution in part or in whole strictly prohibited. Please
# see LICENSE file for full details.
# ==================================================================
package Links::Import::RDFS2;
use 5.004_04;
use strict;
use vars qw/$Warning_Code $Critical_Code $Mild_Code $Print_Out/;
use DBI;
use GT::SQL;
use GT::RDF;
sub critical {
$Critical_Code->(@_);
}
sub warning {
$Warning_Code->(@_);
}
sub mild_warning {
ref $Mild_Code eq 'CODE' and $Mild_Code->(@_);
}
sub import_print {
if (ref $Print_Out eq 'CODE') {
$Print_Out->(@_);
}
else {
print @_;
}
}
# Takes 3-4 arguments: hash reference, 2 or 3 code refs
# The hash reference is the options hash for an import.
# The first code reference will be called when a warning occurs.
# The second code reference will be called when a critical error occurs.
# If provided, the third code reference will be called when a mild warning occurs
sub import {
my $opt = shift;
return if ref $opt ne 'HASH';
{
my $warning = shift;
return if ref $warning ne 'CODE';
$Warning_Code = $warning;
my $critical = shift;
return if ref $critical ne 'CODE';
$Critical_Code = $critical;
my $mild = shift;
$Mild_Code = $mild if ref $mild eq 'CODE';
my $output = shift;
$Print_Out = $output if ref $output eq 'CODE';
}
my $DB = new GT::SQL(def_path => $$opt{destination}, subclass => 0);
my $e_prefix = $DB->prefix;
my $e_dbh;
{
my $table = $DB->table("Links");
$table->connect();
$e_dbh = $table->{driver}->connect();
}
my $Links_counter;
my $Category_counter;
my $odbc = 0;
if (($DB->table('Links')->{connect}{driver} || "") eq "ODBC") {
$odbc = 1;
# Set max read properties for DBI.
$e_dbh->{LongReadLen} = 1000000;
}
if ($$opt{clear_tables}) {
# Delete everything from all tables, EXCEPT for the `admin' user from the Users table.
# Also ignore --rdf-user if specified.
if ($$opt{rdf_user}) {
my $sth = $e_dbh->prepare("SELECT COUNT(*) FROM ${e_prefix}Users WHERE Username = ?") or critical "Unable to prepare query `SELECT COUNT(*) FROM ${e_prefix}Users WHERE Username = ?': ".$e_dbh->errstr;
$sth->execute($$opt{rdf_user}) or critical "Unable to execute query `SELECT COUNT(*) FROM ${e_prefix}Users WHERE Username = ?': ".$sth->errstr;
if ($sth->fetchrow_array) {
$e_dbh->do("DELETE FROM ${e_prefix}Users WHERE Username <> 'admin' AND Username <> ".$e_dbh->quote($$opt{rdf_user}));
}
else {
critical "The rdf username that you specified ($$opt{rdf_user}) does not exist. Please create the user.";
}
}
else {
$e_dbh->do("DELETE FROM ${e_prefix}Users WHERE Username <> 'admin'") or critical "Unable to delete all existing users: ".$e_dbh->errstr;
}
for (qw/Links Category CatLinks CatRelations Category_Score_List
Category_Word_List ClickTrack Editors EmailMailings EmailTemplates
Links_Score_List Links_Word_List MailingIndex MailingList
MailingListIndex Verify/) {
$e_dbh->do("DELETE FROM $e_prefix$_");
}
unless ($$opt{straight_import}) {
$Links_counter = $Category_counter = 0;
}
}
else {
my $sth = $e_dbh->prepare("SELECT MAX(ID) FROM ${e_prefix}Links") or critical "Unable to prepare query `SELECT MAX(ID) FROM ${e_prefix}Links': ".$e_dbh->errstr;
$sth->execute() or critical "Unable to execute query `SELECT MAX(ID) FROM ${e_prefix}Links': ".$sth->errstr;
$Links_counter = $sth->fetchrow_array;
$sth->finish();
$sth = $e_dbh->prepare("SELECT MAX(ID) FROM ${e_prefix}Category") or critical "Unable to prepare query `SELECT MAX(ID) FROM ${e_prefix}Category': ".$e_dbh->errstr;
$sth->execute() or critical "Unable to execute query `SELECT MAX(ID) FROM ${e_prefix}Category': ".$sth->errstr;
$Category_counter = $sth->fetchrow_array;
$sth->finish();
}
my $gzip = $$opt{with_gzip};
my $need_gz = -B $$opt{source}; # Tests if the file is binary.
my $is_win = $^O =~ /win/i;
my $gzip_filename = $is_win ? "gzip.exe" : "gzip";
if (not $gzip and $need_gz) {
# Try to find gzip
my $dir_sep = $is_win ? ";" : ":";
my @locations = split /$dir_sep/, $ENV{PATH};
for (@locations) {
-x "$_/$gzip_filename" and $gzip = "$_/$gzip_filename", last;
}
$gzip or critical "\nUnable to locate gzip (Searched @locations). Please specify with --with-gzip=\"/path/to/gzip\"";
}
elsif ($gzip and -d $gzip) {
if (-x "$gzip/$gzip_filename") {
$gzip = "$gzip/$gzip_filename";
}
else {
critical "\nThe directory $gzip does not contain a valid gzip program";
}
}
if ($need_gz) {
-x $gzip or critical "\nUnable to find an executable gzip";
}
my $rdf = \do { local *FH; *FH };
if ($$opt{with_gzip} or $need_gz) {
import_print "\nOpening uncompressed stream from gzip compressed file `$$opt{source}' ...\n";
open $rdf, " $gzip -c -d $$opt{source} |" or critical "Unable to open `$gzip -c -d $$opt{source} |': $!";
}
else {
import_print "\nOpening stream from non-compressed file `$$opt{source}' ...\n";
open $rdf, "<$$opt{source}" or critical "Unable to open $$opt{source}: $!";
}
import_print "Stream opened.\n";
# Do the import
{
my $count_cats_sth = $e_dbh->prepare("SELECT COUNT(*) FROM ${e_prefix}Category WHERE Full_Name = ?") or critical "Unable to prepare query `SELECT COUNT(*) FROM ${e_prefix}Category WHERE Full_Name = ?': ".$e_dbh->errstr;
my $cat_id_sth = $e_dbh->prepare("SELECT ID FROM ${e_prefix}Category WHERE Full_Name = ?") or critical "Unable to prepare query `SELECT ID FROM ${e_prefix}Category WHERE Full_Name = ?': ".$e_dbh->errstr;
my $cat_ins_sth = $odbc
? $e_dbh->prepare("SET IDENTITY_INSERT ${e_prefix}Category ON; INSERT INTO ${e_prefix}Category (ID, Name, Full_Name, FatherID, Number_of_Links, Has_New_Links, Has_Changed_Links) VALUES (?, ?, ?, ?, 0, 'No', 'No')")
: $e_dbh->prepare("INSERT INTO ${e_prefix}Category (ID, Name, Full_Name, FatherID, Number_of_Links, Has_New_Links, Has_Changed_Links) VALUES (?, ?, ?, ?, 0, 'No', 'No')");
$cat_ins_sth or critical "Unable to prepare query `INSERT INTO ${e_prefix}Category (ID, Name, Full_Name, FatherID) VALUES (?, ?, ?, ?)': ".$e_dbh->errstr;
my $sub_cats_sth = $e_dbh->prepare("UPDATE ${e_prefix}Category SET FatherID = ? WHERE Full_Name LIKE ? AND Full_Name NOT LIKE ?") or critical "Unable to prepare query `UPDATE ${e_prefix}Category SET FatherID = ? WHERE Full_Name LIKE ? AND Full_Name NOT LIKE ?': ".$e_dbh->errstr;
my $get_num_links_sth = $e_dbh->prepare("SELECT COUNT(*) FROM ${e_prefix}CatLinks WHERE CategoryID = ?") or critical "Unable to prepare query `SELECT COUNT(*) FROM ${e_prefix}CatLinks WHERE CategoryID = ?': ".$e_dbh->errstr;
my $num_links_sth = $e_dbh->prepare("UPDATE ${e_prefix}Category SET Number_of_Links = ? WHERE ID = ?") or critical "Unable to prepare query `UPDATE ${e_prefix}Category SET Number_of_Links = ? WHERE ID = ?': ".$e_dbh->errstr;
my $insert_link_sth = $odbc
? $e_dbh->prepare("SET IDENTITY_INSERT ${e_prefix}Links ON; INSERT INTO ${e_prefix}Links (ID, Title, URL, Add_Date, Mod_Date, Description, LinkOwner, Status, Votes, Rating, Hits) VALUES (?, ?, ?, ?, ?, ?, ".$e_dbh->quote($$opt{rdf_user} or "admin").",0,0,0,0)")
: $e_dbh->prepare("INSERT INTO ${e_prefix}Links (ID, Title, URL, Add_Date, Mod_Date, Description, LinkOwner, Status, Votes, Rating, Hits) VALUES (?, ?, ?, ?, ?, ?, ".$e_dbh->quote($$opt{rdf_user} or "admin").",0,0,0,0)");
$insert_link_sth or critical "Unable to prepare query `INSERT INTO ${e_prefix}Links (Title, URL, Add_Date, Mod_Date, Description, LinkOwner) VALUES (?, ?, ?, ?, ?, ?)': ".$e_dbh->errstr;
my $link_exists_sth = $e_dbh->prepare("SELECT COUNT(*) FROM ${e_prefix}Links, ${e_prefix}CatLinks WHERE URL = ? AND CategoryID = ? AND ID = LinkID") or critical "Unable to prepare query `SELECT * FROM Links, CatLinks WHERE URL = ? AND CategoryID = ? AND ID = LinkID': ".$e_dbh->errstr;
my $cat_links_sth = $e_dbh->prepare("INSERT INTO ${e_prefix}CatLinks (LinkID, CategoryID) VALUES (?, ?)") or critical "Unable to prepare query `INSERT INTO ${e_prefix}CatLinks (LinkID, CategoryID) VALUES (?, ?)': ".$e_dbh->errstr;
my $count_links_sth = $e_dbh->prepare("SELECT COUNT(*) FROM ${e_prefix}CatLinks WHERE CategoryID = ?") or critical "Unable to prepare query `SELECT COUNT(*) FROM ${e_prefix}CatLinks WHERE CategoryID = ?': ".$e_dbh->errstr;
import_print "\nImporting from $$opt{source}\n";
my $links_imported = 0;
my $cats_imported = 0;
my %cat_needs_num; # $cat_needs_num{cat_id} = 1 if the category needs to have its Number_Of_Links updated
my $base_cat = $$opt{rdf_destination};
my $base_cat_id;
if (defined $base_cat) {
$base_cat =~ s|//+|/|g; # Remove doubled (and more) slashes
$base_cat =~ s|^/+||; # Remove any leading slashes
$base_cat =~ s|/+$||; # And any trailing ones
}
else {
$base_cat = "";
}
if (length $base_cat) {
$count_cats_sth->execute($base_cat) or critical "Unable to execute query `SELECT COUNT(*) FROM ${e_prefix}Category WHERE Full_Name = ?': ".$count_cats_sth->errstr;
if ($count_cats_sth->fetchrow_array) {
$cat_id_sth->execute($base_cat) or critical "Unable to execute query `SELECT ID FROM ${e_prefix}Category WHERE Full_Name = ?': ".$cat_id_sth->errstr;
$base_cat_id = $cat_id_sth->fetchrow_array;
}
else { # Category doesn't exist
my @missing_cats = $base_cat;
mild_warning "$base_cat does not exist and is being created";
my $fn = $base_cat;
while ($fn =~ s[/[^/]*\Z][]) {
$count_cats_sth->execute($fn) or critical "Unable to execute query `SELECT COUNT(*) FROM ${e_prefix}Category WHERE Full_Name = ?': ".$count_cats_sth->errstr;
if ($count_cats_sth->fetchrow_array) { # It exists
last;
}
else {
unshift @missing_cats, $fn;
mild_warning "$fn is needed for base category $base_cat and does not exist. It will be created";
}
}
for (@missing_cats) {
my ($name) = m[([^/]+)\Z];
my ($father_full) = m[\A(.*)/];
my $father_id;
if ($father_full) {
$cat_id_sth->execute($father_full) or critical "Unable to execute query `SELECT ID FROM ${e_prefix}Category WHERE Full_Name = ?': ".$cat_id_sth->errstr;
$father_id = $cat_id_sth->fetchrow_array;
}
else { # Must be the root category
$father_id = 0;
}
$cat_ins_sth->execute(++$Category_counter,$name,$_,$father_id) or critical "Unable to create missing category $_: ".$cat_ins_sth->errstr;
$cats_imported++;
$sub_cats_sth->execute($Category_counter,"$_/%","$_/%/%") or critical "Unable to execute query `UPDATE ${e_prefix}Category SET FatherID = ? WHERE Full_Name LIKE ? and Full_Name NOT LIKE ?': ".$sub_cats_sth->errstr;
}
$base_cat_id = $Category_counter; # The last one inserted will be the category import base
}
}
else {
$base_cat_id = 0;
}
my $cat = $$opt{rdf_category};
# -------------------------------------------------------------------
# Main code, get a parser object and start parsing!
#
# New for 2.2.0 - XML::Parser-based code, which should be significantly faster.
# It should, however, still be considered experimental.
#
if ($$opt{xml_parser}) {
require XML::Parser;
my (%links, %want, %current, @in);
my $last_status = -1;
my $insert_cat = sub {
my $cat_name = $current{category};
# If we are under ODBC we need to reset the sth handle to avoid a "Invalid Cursor State" error.
$odbc and $count_cats_sth->finish;
# Check to make sure we haven't seen this category before
# Set $found to the category ID.
$count_cats_sth->execute($cat_name) or critical "Execute: $DBI::errstr";
if ($count_cats_sth->fetchrow_array) {
$cat_id_sth->finish if ($odbc); # Need to reset cursor under odbc.
$cat_id_sth->execute($cat_name) or critical "Execute: $DBI::errstr";
$current{cat_id} = $cat_id_sth->fetchrow_array;
}
else {
my ($father_name, $short_name) = $current{category} =~ m|(?:(.*)/)?([^/]+)$|;
my $father_id;
if ($father_name) {
$cat_id_sth->finish if ($odbc); # Need to reset cursor under odbc.
$cat_id_sth->execute($father_name) or critical "Execute: $DBI::errstr";
$father_id = $cat_id_sth->fetchrow_array;
}
else {
$father_id = 0;
}
$cat_ins_sth->execute(++$Category_counter, $short_name, $current{category}, $father_id) or critical "Execute: $DBI::errstr";
$cats_imported++;
$current{cat_id} = $Category_counter;
}
for my $link (keys %links) {
# Either append, or insert new link.
if ($$opt{rdf_update}) {
$link_exists_sth->execute($link, $current{cat_id}) or critical "Execute: $DBI::errstr";
next if $link_exists_sth->fetchrow;
}
# Title can only be 100 characters (ODBC fatals about data that is too long).
my $title = substr($links{$link}->{title} || $link, 0, 100);
$insert_link_sth->execute(++$Links_counter, $title, $link, $$opt{rdf_add_date}, $$opt{rdf_add_date}, $links{$link}->{description} || "") or critical "Execute: $DBI::errstr";
$cat_links_sth->execute($Links_counter, $current{cat_id}) or critical "Execute: $DBI::errstr";
$cat_needs_num{$current{cat_id}} = 1;
$links_imported++;
}
%links = ();
return scalar keys %links;
};
my $parser = XML::Parser->new(
Handlers => {
Start => sub {
my ($parser, $tag, %attr) = @_;
if ($tag eq 'Topic') {
{
my $disp_topic = $attr{'r:id'};
substr($disp_topic, 30) = '...' if length $disp_topic > 33;
my $padding = " " x (33 - length $disp_topic);
$disp_topic = "(L:$links_imported, C:$cats_imported) $disp_topic$padding";
import_print("\r$disp_topic");
}
if ($current{category}) {
my $cat_count = $insert_cat->();
import_print "$cat_count ";
}
my $dmoz_cat = $attr{'r:id'};
if ($dmoz_cat =~ /^\Q$cat\E(.*)/) {
my $topic = $base_cat . '/' . $1;
$topic =~ s|/{2,}|/|g;
$topic =~ s|^/||;
$topic =~ s|/$||;
$topic =~ y|_| |;
$current{category} = $topic;
}
else {
delete $current{category};
import_print "skipping";
}
}
elsif ($tag eq 'link' and $in[-1] eq 'Topic' and $current{category} and $attr{'r:resource'}) {
$links{$attr{'r:resource'}} = {};
}
elsif ($tag eq 'ExternalPage' and $current{category} and $attr{about}) {
$current{ExternalPage} = $attr{about};
}
elsif ($tag eq 'd:Title' and $in[-1] eq 'ExternalPage' and $current{ExternalPage}) {
$want{title} = \$links{$current{ExternalPage}}->{title};
}
elsif ($tag eq 'd:Description' and $in[-1] eq 'ExternalPage' and $current{ExternalPage}) {
$want{description} = \$links{$current{ExternalPage}}->{description};
}
push @in, $tag;
},
End => sub {
my ($parser, $tag) = @_;
pop @in;
if ($tag eq 'd:Description') {
delete $want{description};
}
elsif ($tag eq 'd:Title') {
delete $want{title};
}
},
Char => sub {
my ($parser, $text) = @_;
if ($want{title}) {
${$want{title}} = $text;
}
elsif ($want{description}) {
${$want{description}} = $text;
}
}
}
);
$parser->parse($rdf, ProtocolEncoding => 'ISO-8859-1');
}
else {
my $parse = GT::RDF->new (io => $rdf);
my ($found, $was_found, %links);
my $cat_count = 0;
my $first = 0;
while ($parse->parse) {
# Either we have a topic, external page or unknown.
if ($parse->{name} eq 'Topic') {
# Add extra links that did not have external page info.
if (defined $found) {
foreach my $link (keys %links) {
if ($$opt{rdf_update}) {
$link_exists_sth->execute($parse->{attribs}{about},$found) or critical "Execute: $DBI::errstr";
unless ($link_exists_sth->fetchrow_array) {
$insert_link_sth->execute(++$Links_counter, $link, $link, $$opt{rdf_add_date}, $$opt{rdf_add_date}, "") or critical "Execute: $DBI::errstr";
$cat_links_sth->execute($Links_counter, $found) or critical "Execute: $DBI::errstr";
$cat_needs_num{$found} = 1;
$links_imported++;
}
}
else {
$insert_link_sth->execute(++$Links_counter, $link, $link, $$opt{rdf_add_date}, $$opt{rdf_add_date}, "") or critical "Execute: $DBI::errstr";
$cat_links_sth->execute($Links_counter, $found) or critical "Execute: $DBI::errstr";
$cat_needs_num{$found} = 1;
$links_imported++;
}
}
import_print "$cat_count ";
}
else {
import_print "skipping" if $first++;
}
# Clear out our links hash, and set found to undef.
$cat_count = 0;
%links = ();
$was_found = $found;
$found = undef;
# We've finished a topic, start a new one.
my $dmoz_cat = $parse->{attribs}{'r:id'};
if ($dmoz_cat =~ /^\Q$cat\E(.*)/) {
my $topic = $base_cat . '/' . $1;
$topic =~ s|//+|/|g;
$topic =~ s|^/||;
$topic =~ s|/$||;
$topic =~ s|_| |g;
if ($topic) {
# If we are under ODBC we need to reset the sth handle to avoid a "Invalid Cursor State" error.
$odbc and ($count_cats_sth->finish);
# Check to make sure we haven't seen this category before
# Set $found to the category ID.
$count_cats_sth->execute($topic) or critical "Execute: $DBI::errstr";
if ($count_cats_sth->fetchrow_array) {
$cat_id_sth->finish if ($odbc); # Need to reset cursor under odbc.
$cat_id_sth->execute($topic) or critical "Execute: $DBI::errstr";
$found = $cat_id_sth->fetchrow_array;
}
else {
my ($short_name) = $topic =~ m|([^/]+)$|;
my ($father_name) = $topic =~ m|(.*)/|;
my $father_id;
if ($father_name) {
$cat_id_sth->finish if ($odbc); # Need to reset cursor under odbc.
$cat_id_sth->execute($father_name) or critical "Execute: $DBI::errstr";
$father_id = $cat_id_sth->fetchrow_array;
}
else {
$father_id = 0;
}
$cat_ins_sth->execute(++$Category_counter, $short_name, $topic, $father_id) or critical "Execute: $DBI::errstr";
$cats_imported++;
$found = $Category_counter;
}
}
}
{
my $disp_topic = $parse->{attribs}{'r:id'};
substr($disp_topic, 30) = '...' if length $disp_topic > 33;
my $padding = " " x (33 - length $disp_topic);
$disp_topic = "(L:$links_imported, C:$cats_imported) $disp_topic$padding";
import_print("\r$disp_topic");
}
if (defined $found) {
for (@{$parse->{tags}}) {
next unless ($_->{attribs}{'r:resource'});
$links{$_->{attribs}{'r:resource'}} = 1;
$cat_count++;
}
}
else {
return 1 if $was_found;
}
}
elsif (defined ($found) and $parse->{name} eq 'ExternalPage') {
# Remove from our simple link list.
delete $links{$parse->{attribs}{about}};
# Insert with description or title if it does not exist and we are not overwritting
my ($title, $desc);
for (@{$parse->{tags}}) {
if ($_->{name} eq 'd:Title' and $_->{data}) { $title = $_->{data} }
elsif ($_->{name} eq 'd:Description' and $_->{data}) { $desc = $_->{data} }
}
$title ||= $parse->{attribs}{about};
$desc ||= '';
# Either append, or insert new link.
if ($$opt{rdf_update}) {
$link_exists_sth->execute ($parse->{attribs}{about},$found) or critical "Execute: $DBI::errstr";
unless ($link_exists_sth->fetchrow_array) {
# Title can only be 100 characters (ODBC fatals about data that is too long).
$title = substr($title, 0, 100);
$insert_link_sth->execute(++$Links_counter, $title, $parse->{attribs}{about}, $$opt{rdf_add_date}, $$opt{rdf_add_date}, $desc) or critical "Execute: $DBI::errstr";
$cat_links_sth->execute($Links_counter, $found) or critical "Execute: $DBI::errstr";
$cat_needs_num{$found} = 1;
$links_imported++;
}
}
else {
# Title can only be 100 characters (ODBC fatals about data that is too long).
$title = substr($title, 0, 100);
$insert_link_sth->execute(++$Links_counter, $title, $parse->{attribs}{about}, $$opt{rdf_add_date}, $$opt{rdf_add_date}, $desc) or critical "Execute: $DBI::errstr";
$cat_links_sth->execute($Links_counter, $found) or critical "Execute: $DBI::errstr";
$cat_needs_num{$found} = 1;
$links_imported++;
}
}
elsif (defined $found) {
require GT::Dumper;
critical "STRANGE TAG: " . GT::Dumper::Dumper($parse) . "\n";
}
}
}
# Now we have to go through the categories to update each one's Number_Of_Links
for (keys %cat_needs_num) {
$count_links_sth->execute($_) or critical "Unable to count links for Category ID $_: ".$count_links_sth->errstr;
my $links = $count_links_sth->fetchrow_array;
$count_links_sth->finish if ($odbc);
$num_links_sth->execute($links,$_) or critical "Unable to update number of links for Category ID $_: ".$num_links_sth->errstr;
}
}
import_print "\nNOTE: You must run Rebuild Cat. tree, Repair Tables, and Rebuild Search after performing an import!\n";
$e_dbh->disconnect;
1;
}
'"I am lying," said the man. Was he?';