# ================================================================== # Gossamer Links - enhanced directory management system # # Website : http://gossamer-threads.com/ # Support : http://gossamer-threads.com/scripts/support/ # CVS Info : 087,071,086,086,085 # Revision : $Id: RDFS2.pm,v 1.20 2005/04/07 19:34:41 brewt Exp $ # # Copyright (c) 2001 Gossamer Threads Inc. All Rights Reserved. # Redistribution in part or in whole strictly prohibited. Please # see LICENSE file for full details. # ================================================================== package Links::Import::RDFS2; use 5.004_04; use strict; use vars qw/$Warning_Code $Critical_Code $Mild_Code $Print_Out/; use DBI; use GT::SQL; use GT::RDF; sub critical { $Critical_Code->(@_); } sub warning { $Warning_Code->(@_); } sub mild_warning { ref $Mild_Code eq 'CODE' and $Mild_Code->(@_); } sub import_print { if (ref $Print_Out eq 'CODE') { $Print_Out->(@_); } else { print @_; } } # Takes 3-4 arguments: hash reference, 2 or 3 code refs # The hash reference is the options hash for an import. # The first code reference will be called when a warning occurs. # The second code reference will be called when a critical error occurs. # If provided, the third code reference will be called when a mild warning occurs sub import { my $opt = shift; return if ref $opt ne 'HASH'; { my $warning = shift; return if ref $warning ne 'CODE'; $Warning_Code = $warning; my $critical = shift; return if ref $critical ne 'CODE'; $Critical_Code = $critical; my $mild = shift; $Mild_Code = $mild if ref $mild eq 'CODE'; my $output = shift; $Print_Out = $output if ref $output eq 'CODE'; } my $DB = new GT::SQL(def_path => $$opt{destination}, subclass => 0); my $e_prefix = $DB->prefix; my $e_dbh; { my $table = $DB->table("Links"); $table->connect(); $e_dbh = $table->{driver}->connect(); } my $Links_counter; my $Category_counter; my $odbc = 0; if (($DB->table('Links')->{connect}{driver} || "") eq "ODBC") { $odbc = 1; # Set max read properties for DBI. $e_dbh->{LongReadLen} = 1000000; } if ($$opt{clear_tables}) { # Delete everything from all tables, EXCEPT for the `admin' user from the Users table. # Also ignore --rdf-user if specified. if ($$opt{rdf_user}) { my $sth = $e_dbh->prepare("SELECT COUNT(*) FROM ${e_prefix}Users WHERE Username = ?") or critical "Unable to prepare query `SELECT COUNT(*) FROM ${e_prefix}Users WHERE Username = ?': ".$e_dbh->errstr; $sth->execute($$opt{rdf_user}) or critical "Unable to execute query `SELECT COUNT(*) FROM ${e_prefix}Users WHERE Username = ?': ".$sth->errstr; if ($sth->fetchrow_array) { $e_dbh->do("DELETE FROM ${e_prefix}Users WHERE Username <> 'admin' AND Username <> ".$e_dbh->quote($$opt{rdf_user})); } else { critical "The rdf username that you specified ($$opt{rdf_user}) does not exist. Please create the user."; } } else { $e_dbh->do("DELETE FROM ${e_prefix}Users WHERE Username <> 'admin'") or critical "Unable to delete all existing users: ".$e_dbh->errstr; } for (qw/Links Category CatLinks CatRelations Category_Score_List Category_Word_List ClickTrack Editors EmailMailings EmailTemplates Links_Score_List Links_Word_List MailingIndex MailingList MailingListIndex Verify/) { $e_dbh->do("DELETE FROM $e_prefix$_"); } unless ($$opt{straight_import}) { $Links_counter = $Category_counter = 0; } } else { my $sth = $e_dbh->prepare("SELECT MAX(ID) FROM ${e_prefix}Links") or critical "Unable to prepare query `SELECT MAX(ID) FROM ${e_prefix}Links': ".$e_dbh->errstr; $sth->execute() or critical "Unable to execute query `SELECT MAX(ID) FROM ${e_prefix}Links': ".$sth->errstr; $Links_counter = $sth->fetchrow_array; $sth->finish(); $sth = $e_dbh->prepare("SELECT MAX(ID) FROM ${e_prefix}Category") or critical "Unable to prepare query `SELECT MAX(ID) FROM ${e_prefix}Category': ".$e_dbh->errstr; $sth->execute() or critical "Unable to execute query `SELECT MAX(ID) FROM ${e_prefix}Category': ".$sth->errstr; $Category_counter = $sth->fetchrow_array; $sth->finish(); } my $gzip = $$opt{with_gzip}; my $need_gz = -B $$opt{source}; # Tests if the file is binary. my $is_win = $^O =~ /win/i; my $gzip_filename = $is_win ? "gzip.exe" : "gzip"; if (not $gzip and $need_gz) { # Try to find gzip my $dir_sep = $is_win ? ";" : ":"; my @locations = split /$dir_sep/, $ENV{PATH}; for (@locations) { -x "$_/$gzip_filename" and $gzip = "$_/$gzip_filename", last; } $gzip or critical "\nUnable to locate gzip (Searched @locations). Please specify with --with-gzip=\"/path/to/gzip\""; } elsif ($gzip and -d $gzip) { if (-x "$gzip/$gzip_filename") { $gzip = "$gzip/$gzip_filename"; } else { critical "\nThe directory $gzip does not contain a valid gzip program"; } } if ($need_gz) { -x $gzip or critical "\nUnable to find an executable gzip"; } my $rdf = \do { local *FH; *FH }; if ($$opt{with_gzip} or $need_gz) { import_print "\nOpening uncompressed stream from gzip compressed file `$$opt{source}' ...\n"; open $rdf, " $gzip -c -d $$opt{source} |" or critical "Unable to open `$gzip -c -d $$opt{source} |': $!"; } else { import_print "\nOpening stream from non-compressed file `$$opt{source}' ...\n"; open $rdf, "<$$opt{source}" or critical "Unable to open $$opt{source}: $!"; } import_print "Stream opened.\n"; # Do the import { my $count_cats_sth = $e_dbh->prepare("SELECT COUNT(*) FROM ${e_prefix}Category WHERE Full_Name = ?") or critical "Unable to prepare query `SELECT COUNT(*) FROM ${e_prefix}Category WHERE Full_Name = ?': ".$e_dbh->errstr; my $cat_id_sth = $e_dbh->prepare("SELECT ID FROM ${e_prefix}Category WHERE Full_Name = ?") or critical "Unable to prepare query `SELECT ID FROM ${e_prefix}Category WHERE Full_Name = ?': ".$e_dbh->errstr; my $cat_ins_sth = $odbc ? $e_dbh->prepare("SET IDENTITY_INSERT ${e_prefix}Category ON; INSERT INTO ${e_prefix}Category (ID, Name, Full_Name, FatherID, Number_of_Links, Has_New_Links, Has_Changed_Links) VALUES (?, ?, ?, ?, 0, 'No', 'No')") : $e_dbh->prepare("INSERT INTO ${e_prefix}Category (ID, Name, Full_Name, FatherID, Number_of_Links, Has_New_Links, Has_Changed_Links) VALUES (?, ?, ?, ?, 0, 'No', 'No')"); $cat_ins_sth or critical "Unable to prepare query `INSERT INTO ${e_prefix}Category (ID, Name, Full_Name, FatherID) VALUES (?, ?, ?, ?)': ".$e_dbh->errstr; my $sub_cats_sth = $e_dbh->prepare("UPDATE ${e_prefix}Category SET FatherID = ? WHERE Full_Name LIKE ? AND Full_Name NOT LIKE ?") or critical "Unable to prepare query `UPDATE ${e_prefix}Category SET FatherID = ? WHERE Full_Name LIKE ? AND Full_Name NOT LIKE ?': ".$e_dbh->errstr; my $get_num_links_sth = $e_dbh->prepare("SELECT COUNT(*) FROM ${e_prefix}CatLinks WHERE CategoryID = ?") or critical "Unable to prepare query `SELECT COUNT(*) FROM ${e_prefix}CatLinks WHERE CategoryID = ?': ".$e_dbh->errstr; my $num_links_sth = $e_dbh->prepare("UPDATE ${e_prefix}Category SET Number_of_Links = ? WHERE ID = ?") or critical "Unable to prepare query `UPDATE ${e_prefix}Category SET Number_of_Links = ? WHERE ID = ?': ".$e_dbh->errstr; my $insert_link_sth = $odbc ? $e_dbh->prepare("SET IDENTITY_INSERT ${e_prefix}Links ON; INSERT INTO ${e_prefix}Links (ID, Title, URL, Add_Date, Mod_Date, Description, LinkOwner, Status, Votes, Rating, Hits) VALUES (?, ?, ?, ?, ?, ?, ".$e_dbh->quote($$opt{rdf_user} or "admin").",0,0,0,0)") : $e_dbh->prepare("INSERT INTO ${e_prefix}Links (ID, Title, URL, Add_Date, Mod_Date, Description, LinkOwner, Status, Votes, Rating, Hits) VALUES (?, ?, ?, ?, ?, ?, ".$e_dbh->quote($$opt{rdf_user} or "admin").",0,0,0,0)"); $insert_link_sth or critical "Unable to prepare query `INSERT INTO ${e_prefix}Links (Title, URL, Add_Date, Mod_Date, Description, LinkOwner) VALUES (?, ?, ?, ?, ?, ?)': ".$e_dbh->errstr; my $link_exists_sth = $e_dbh->prepare("SELECT COUNT(*) FROM ${e_prefix}Links, ${e_prefix}CatLinks WHERE URL = ? AND CategoryID = ? AND ID = LinkID") or critical "Unable to prepare query `SELECT * FROM Links, CatLinks WHERE URL = ? AND CategoryID = ? AND ID = LinkID': ".$e_dbh->errstr; my $cat_links_sth = $e_dbh->prepare("INSERT INTO ${e_prefix}CatLinks (LinkID, CategoryID) VALUES (?, ?)") or critical "Unable to prepare query `INSERT INTO ${e_prefix}CatLinks (LinkID, CategoryID) VALUES (?, ?)': ".$e_dbh->errstr; my $count_links_sth = $e_dbh->prepare("SELECT COUNT(*) FROM ${e_prefix}CatLinks WHERE CategoryID = ?") or critical "Unable to prepare query `SELECT COUNT(*) FROM ${e_prefix}CatLinks WHERE CategoryID = ?': ".$e_dbh->errstr; import_print "\nImporting from $$opt{source}\n"; my $links_imported = 0; my $cats_imported = 0; my %cat_needs_num; # $cat_needs_num{cat_id} = 1 if the category needs to have its Number_Of_Links updated my $base_cat = $$opt{rdf_destination}; my $base_cat_id; if (defined $base_cat) { $base_cat =~ s|//+|/|g; # Remove doubled (and more) slashes $base_cat =~ s|^/+||; # Remove any leading slashes $base_cat =~ s|/+$||; # And any trailing ones } else { $base_cat = ""; } if (length $base_cat) { $count_cats_sth->execute($base_cat) or critical "Unable to execute query `SELECT COUNT(*) FROM ${e_prefix}Category WHERE Full_Name = ?': ".$count_cats_sth->errstr; if ($count_cats_sth->fetchrow_array) { $cat_id_sth->execute($base_cat) or critical "Unable to execute query `SELECT ID FROM ${e_prefix}Category WHERE Full_Name = ?': ".$cat_id_sth->errstr; $base_cat_id = $cat_id_sth->fetchrow_array; } else { # Category doesn't exist my @missing_cats = $base_cat; mild_warning "$base_cat does not exist and is being created"; my $fn = $base_cat; while ($fn =~ s[/[^/]*\Z][]) { $count_cats_sth->execute($fn) or critical "Unable to execute query `SELECT COUNT(*) FROM ${e_prefix}Category WHERE Full_Name = ?': ".$count_cats_sth->errstr; if ($count_cats_sth->fetchrow_array) { # It exists last; } else { unshift @missing_cats, $fn; mild_warning "$fn is needed for base category $base_cat and does not exist. It will be created"; } } for (@missing_cats) { my ($name) = m[([^/]+)\Z]; my ($father_full) = m[\A(.*)/]; my $father_id; if ($father_full) { $cat_id_sth->execute($father_full) or critical "Unable to execute query `SELECT ID FROM ${e_prefix}Category WHERE Full_Name = ?': ".$cat_id_sth->errstr; $father_id = $cat_id_sth->fetchrow_array; } else { # Must be the root category $father_id = 0; } $cat_ins_sth->execute(++$Category_counter,$name,$_,$father_id) or critical "Unable to create missing category $_: ".$cat_ins_sth->errstr; $cats_imported++; $sub_cats_sth->execute($Category_counter,"$_/%","$_/%/%") or critical "Unable to execute query `UPDATE ${e_prefix}Category SET FatherID = ? WHERE Full_Name LIKE ? and Full_Name NOT LIKE ?': ".$sub_cats_sth->errstr; } $base_cat_id = $Category_counter; # The last one inserted will be the category import base } } else { $base_cat_id = 0; } my $cat = $$opt{rdf_category}; # ------------------------------------------------------------------- # Main code, get a parser object and start parsing! # # New for 2.2.0 - XML::Parser-based code, which should be significantly faster. # It should, however, still be considered experimental. # if ($$opt{xml_parser}) { require XML::Parser; my (%links, %want, %current, @in); my $last_status = -1; my $insert_cat = sub { my $cat_name = $current{category}; # If we are under ODBC we need to reset the sth handle to avoid a "Invalid Cursor State" error. $odbc and $count_cats_sth->finish; # Check to make sure we haven't seen this category before # Set $found to the category ID. $count_cats_sth->execute($cat_name) or critical "Execute: $DBI::errstr"; if ($count_cats_sth->fetchrow_array) { $cat_id_sth->finish if ($odbc); # Need to reset cursor under odbc. $cat_id_sth->execute($cat_name) or critical "Execute: $DBI::errstr"; $current{cat_id} = $cat_id_sth->fetchrow_array; } else { my ($father_name, $short_name) = $current{category} =~ m|(?:(.*)/)?([^/]+)$|; my $father_id; if ($father_name) { $cat_id_sth->finish if ($odbc); # Need to reset cursor under odbc. $cat_id_sth->execute($father_name) or critical "Execute: $DBI::errstr"; $father_id = $cat_id_sth->fetchrow_array; } else { $father_id = 0; } $cat_ins_sth->execute(++$Category_counter, $short_name, $current{category}, $father_id) or critical "Execute: $DBI::errstr"; $cats_imported++; $current{cat_id} = $Category_counter; } for my $link (keys %links) { # Either append, or insert new link. if ($$opt{rdf_update}) { $link_exists_sth->execute($link, $current{cat_id}) or critical "Execute: $DBI::errstr"; next if $link_exists_sth->fetchrow; } # Title can only be 100 characters (ODBC fatals about data that is too long). my $title = substr($links{$link}->{title} || $link, 0, 100); $insert_link_sth->execute(++$Links_counter, $title, $link, $$opt{rdf_add_date}, $$opt{rdf_add_date}, $links{$link}->{description} || "") or critical "Execute: $DBI::errstr"; $cat_links_sth->execute($Links_counter, $current{cat_id}) or critical "Execute: $DBI::errstr"; $cat_needs_num{$current{cat_id}} = 1; $links_imported++; } %links = (); return scalar keys %links; }; my $parser = XML::Parser->new( Handlers => { Start => sub { my ($parser, $tag, %attr) = @_; if ($tag eq 'Topic') { { my $disp_topic = $attr{'r:id'}; substr($disp_topic, 30) = '...' if length $disp_topic > 33; my $padding = " " x (33 - length $disp_topic); $disp_topic = "(L:$links_imported, C:$cats_imported) $disp_topic$padding"; import_print("\r$disp_topic"); } if ($current{category}) { my $cat_count = $insert_cat->(); import_print "$cat_count "; } my $dmoz_cat = $attr{'r:id'}; if ($dmoz_cat =~ /^\Q$cat\E(.*)/) { my $topic = $base_cat . '/' . $1; $topic =~ s|/{2,}|/|g; $topic =~ s|^/||; $topic =~ s|/$||; $topic =~ y|_| |; $current{category} = $topic; } else { delete $current{category}; import_print "skipping"; } } elsif ($tag eq 'link' and $in[-1] eq 'Topic' and $current{category} and $attr{'r:resource'}) { $links{$attr{'r:resource'}} = {}; } elsif ($tag eq 'ExternalPage' and $current{category} and $attr{about}) { $current{ExternalPage} = $attr{about}; } elsif ($tag eq 'd:Title' and $in[-1] eq 'ExternalPage' and $current{ExternalPage}) { $want{title} = \$links{$current{ExternalPage}}->{title}; } elsif ($tag eq 'd:Description' and $in[-1] eq 'ExternalPage' and $current{ExternalPage}) { $want{description} = \$links{$current{ExternalPage}}->{description}; } push @in, $tag; }, End => sub { my ($parser, $tag) = @_; pop @in; if ($tag eq 'd:Description') { delete $want{description}; } elsif ($tag eq 'd:Title') { delete $want{title}; } }, Char => sub { my ($parser, $text) = @_; if ($want{title}) { ${$want{title}} = $text; } elsif ($want{description}) { ${$want{description}} = $text; } } } ); $parser->parse($rdf, ProtocolEncoding => 'ISO-8859-1'); } else { my $parse = GT::RDF->new (io => $rdf); my ($found, $was_found, %links); my $cat_count = 0; my $first = 0; while ($parse->parse) { # Either we have a topic, external page or unknown. if ($parse->{name} eq 'Topic') { # Add extra links that did not have external page info. if (defined $found) { foreach my $link (keys %links) { if ($$opt{rdf_update}) { $link_exists_sth->execute($parse->{attribs}{about},$found) or critical "Execute: $DBI::errstr"; unless ($link_exists_sth->fetchrow_array) { $insert_link_sth->execute(++$Links_counter, $link, $link, $$opt{rdf_add_date}, $$opt{rdf_add_date}, "") or critical "Execute: $DBI::errstr"; $cat_links_sth->execute($Links_counter, $found) or critical "Execute: $DBI::errstr"; $cat_needs_num{$found} = 1; $links_imported++; } } else { $insert_link_sth->execute(++$Links_counter, $link, $link, $$opt{rdf_add_date}, $$opt{rdf_add_date}, "") or critical "Execute: $DBI::errstr"; $cat_links_sth->execute($Links_counter, $found) or critical "Execute: $DBI::errstr"; $cat_needs_num{$found} = 1; $links_imported++; } } import_print "$cat_count "; } else { import_print "skipping" if $first++; } # Clear out our links hash, and set found to undef. $cat_count = 0; %links = (); $was_found = $found; $found = undef; # We've finished a topic, start a new one. my $dmoz_cat = $parse->{attribs}{'r:id'}; if ($dmoz_cat =~ /^\Q$cat\E(.*)/) { my $topic = $base_cat . '/' . $1; $topic =~ s|//+|/|g; $topic =~ s|^/||; $topic =~ s|/$||; $topic =~ s|_| |g; if ($topic) { # If we are under ODBC we need to reset the sth handle to avoid a "Invalid Cursor State" error. $odbc and ($count_cats_sth->finish); # Check to make sure we haven't seen this category before # Set $found to the category ID. $count_cats_sth->execute($topic) or critical "Execute: $DBI::errstr"; if ($count_cats_sth->fetchrow_array) { $cat_id_sth->finish if ($odbc); # Need to reset cursor under odbc. $cat_id_sth->execute($topic) or critical "Execute: $DBI::errstr"; $found = $cat_id_sth->fetchrow_array; } else { my ($short_name) = $topic =~ m|([^/]+)$|; my ($father_name) = $topic =~ m|(.*)/|; my $father_id; if ($father_name) { $cat_id_sth->finish if ($odbc); # Need to reset cursor under odbc. $cat_id_sth->execute($father_name) or critical "Execute: $DBI::errstr"; $father_id = $cat_id_sth->fetchrow_array; } else { $father_id = 0; } $cat_ins_sth->execute(++$Category_counter, $short_name, $topic, $father_id) or critical "Execute: $DBI::errstr"; $cats_imported++; $found = $Category_counter; } } } { my $disp_topic = $parse->{attribs}{'r:id'}; substr($disp_topic, 30) = '...' if length $disp_topic > 33; my $padding = " " x (33 - length $disp_topic); $disp_topic = "(L:$links_imported, C:$cats_imported) $disp_topic$padding"; import_print("\r$disp_topic"); } if (defined $found) { for (@{$parse->{tags}}) { next unless ($_->{attribs}{'r:resource'}); $links{$_->{attribs}{'r:resource'}} = 1; $cat_count++; } } else { return 1 if $was_found; } } elsif (defined ($found) and $parse->{name} eq 'ExternalPage') { # Remove from our simple link list. delete $links{$parse->{attribs}{about}}; # Insert with description or title if it does not exist and we are not overwritting my ($title, $desc); for (@{$parse->{tags}}) { if ($_->{name} eq 'd:Title' and $_->{data}) { $title = $_->{data} } elsif ($_->{name} eq 'd:Description' and $_->{data}) { $desc = $_->{data} } } $title ||= $parse->{attribs}{about}; $desc ||= ''; # Either append, or insert new link. if ($$opt{rdf_update}) { $link_exists_sth->execute ($parse->{attribs}{about},$found) or critical "Execute: $DBI::errstr"; unless ($link_exists_sth->fetchrow_array) { # Title can only be 100 characters (ODBC fatals about data that is too long). $title = substr($title, 0, 100); $insert_link_sth->execute(++$Links_counter, $title, $parse->{attribs}{about}, $$opt{rdf_add_date}, $$opt{rdf_add_date}, $desc) or critical "Execute: $DBI::errstr"; $cat_links_sth->execute($Links_counter, $found) or critical "Execute: $DBI::errstr"; $cat_needs_num{$found} = 1; $links_imported++; } } else { # Title can only be 100 characters (ODBC fatals about data that is too long). $title = substr($title, 0, 100); $insert_link_sth->execute(++$Links_counter, $title, $parse->{attribs}{about}, $$opt{rdf_add_date}, $$opt{rdf_add_date}, $desc) or critical "Execute: $DBI::errstr"; $cat_links_sth->execute($Links_counter, $found) or critical "Execute: $DBI::errstr"; $cat_needs_num{$found} = 1; $links_imported++; } } elsif (defined $found) { require GT::Dumper; critical "STRANGE TAG: " . GT::Dumper::Dumper($parse) . "\n"; } } } # Now we have to go through the categories to update each one's Number_Of_Links for (keys %cat_needs_num) { $count_links_sth->execute($_) or critical "Unable to count links for Category ID $_: ".$count_links_sth->errstr; my $links = $count_links_sth->fetchrow_array; $count_links_sth->finish if ($odbc); $num_links_sth->execute($links,$_) or critical "Unable to update number of links for Category ID $_: ".$num_links_sth->errstr; } } import_print "\nNOTE: You must run Rebuild Cat. tree, Repair Tables, and Rebuild Search after performing an import!\n"; $e_dbh->disconnect; 1; } '"I am lying," said the man. Was he?';