# Federated Computer, Inc.
# David Sainty 2024 A.D.
# Gossamer Threads to Discourse -- Correct Encoding
# v0.6 Further attempt to get this reverse double encoding right now

require 'mysql2'
require 'active_record'
require 'charlock_holmes'
# require 'concurrent-ruby'
require File.expand_path("../../../../config/environment", __FILE__)
require File.expand_path("../../../../script/import_scripts/base", __FILE__)

# Walks every Discourse Post in batches and reports which posts look
# double-encoded (UTF-8 bytes that were mis-read as Windows-1252 and encoded
# to UTF-8 a second time), printing the raw text and the proposed repair.
# The actual write-back is still commented out, so this is a dry run.
class GossamerForumsCorrectEncoding < ImportScripts::Base
  # Connects to the Gossamer Forums MySQL database and sets the batch size.
  # Exits the process with status 1 if the MySQL connection cannot be made.
  #
  # Connection settings may be overridden with the GOSSAMER_DB_HOST,
  # GOSSAMER_DB_USERNAME, GOSSAMER_DB_PASSWORD and GOSSAMER_DB_NAME
  # environment variables; the previous hard-coded values remain the defaults.
  def initialize
    super
    begin
      # Initialize MySQL client to connect to Gossamer Forums database
      # NOTE(review): a live credential is committed here as the fallback
      # value -- rotate it and supply it through the environment instead.
      @mysql_client = Mysql2::Client.new(
        host: ENV.fetch("GOSSAMER_DB_HOST", "slowtwitch.northend.network"),
        username: ENV.fetch("GOSSAMER_DB_USERNAME", "admin"),
        password: ENV.fetch("GOSSAMER_DB_PASSWORD", "yxnh93Ybbz2Nm8#mp28zCVv"),
        database: ENV.fetch("GOSSAMER_DB_NAME", "slowtwitch")
      )
    rescue Mysql2::Error => e
      puts "Error connecting to MySQL: #{e.message}"
      puts e.backtrace.join("\n") # Print the full stack trace
      exit 1
    end
    @batch_size = 1000 # Number of posts to process in each batch
  end

  # Method to detect and fix text encoding.
  #
  # Reverses one round of double encoding: the characters are encoded back to
  # Windows-1252 bytes and those bytes are then re-read as UTF-8.
  #
  # content - the String to repair (nil and "" are returned untouched).
  #
  # Returns the repaired String, or the original content when the text cannot
  # be mapped to CP1252 or when the re-read bytes are not valid UTF-8 (which
  # means the text was not double-encoded in the first place).
  def fix_text_encoding(content)
    return content if content.nil? || content.empty?

    begin
      # Treat as Windows-1252 (cp1252) and then decode into UTF-8
      corrected_content = content.encode('CP1252').force_encoding('UTF-8')
    rescue Encoding::UndefinedConversionError, Encoding::InvalidByteSequenceError => e
      puts "Error during encoding conversion: #{e.message}"
      puts e.backtrace.join("\n") # Print the full stack trace
      return content # Return the original content if conversion fails
    end

    # Correctly encoded non-ASCII text (a genuine "é", a curly quote, ...)
    # turns into an invalid byte sequence here; never offer that as a "fix".
    corrected_content.valid_encoding? ? corrected_content : content
  end

  # ---------------------------------------------------------------------------
  # Earlier attempts, retained for reference only. The first of them was left
  # as live statements outside any method, which is not valid there; all of
  # them are now commented out.
  # ---------------------------------------------------------------------------
  #
  #   # Ensure the content is treated as UTF-8 (even if incorrectly encoded)
  #   content.force_encoding('UTF-8')
  #   # Continue decoding until no more invalid sequences are found
  #   previous_content = ""
  #   while previous_content != content
  #     previous_content = content.dup
  #     # Step 1: First attempt to convert from ISO-8859-1 to UTF-8
  #     if content.valid_encoding?
  #       # Decode from ISO-8859-1 (or Windows-1252) to UTF-8
  #       content = content.force_encoding('ISO-8859-1').encode('UTF-8', invalid: :replace, undef: :replace, replace: '?')
  #     else
  #       content = previous_content # Stop if invalid encoding issues arise
  #     end
  #   end
  #   return content
  # end
  #
  #   # Step 1: Try to detect encoding of the corrupted (double-encoded) content
  #   detection = CharlockHolmes::EncodingDetector.detect(broken_content)
  #   original_encoding = detection[:encoding]
  #   puts "Original encoding detected: #{original_encoding}"
  #
  #   # Step 2: First decode the double-encoded content
  #   begin
  #     # Convert the content assuming it was double-encoded, so decode twice
  #     # First, convert from the detected encoding (ISO-8859-1 or windows-1252) to UTF-8
  #     first_pass = CharlockHolmes::Converter.convert(broken_content, original_encoding, 'UTF-8')
  #
  #     # Step 3: Now re-interpret that output as if it's broken UTF-8 and convert it back to UTF-8
  #     fixed_content = CharlockHolmes::Converter.convert(first_pass, 'UTF-8', 'UTF-8')
  #
  #   rescue => e
  #     puts "Error during encoding fix: #{e.message}"
  #     puts e.backtrace.join("\n") # Print the full stack trace
  #
  #     fixed_content = broken_content # Fall back to the broken content if decoding fails
  #   end
  #
  #   return fixed_content
  # end
  #
  #   # Detect encoding
  #   detection = CharlockHolmes::EncodingDetector.detect(raw_content)
  #   original_encoding = detection[:encoding]
  #   puts "Original encoding detected: #{original_encoding}"
  #   # Force the encoding to the detected one, then convert to UTF-8
  #   if original_encoding == 'ISO-8859-1' || original_encoding == 'windows-1252'
  #     # For Windows-1252 or ISO-8859-1, force the encoding and convert to UTF-8
  #     # text.force_encoding('ISO-8859-1').encode('UTF-8')
  #     text.force_encoding(original_encoding).encode('UTF-8', invalid: :replace, undef: :replace, replace: '?')
  #   else
  #     # Try to convert from detected encoding to UTF-8
  #     text.encode('UTF-8', original_encoding, invalid: :replace, undef: :replace, replace: '?')
  #   end
  #   if original_encoding
  #     begin
  #       decoded_content = CharlockHolmes::Converter.convert(raw_content, original_encoding, 'UTF-8')
  #     rescue => e
  #       puts "Error during encoding conversion: #{e.message}"
  #       decoded_content = raw_content # Fall back to raw content if decoding fails
  #     end
  #   else
  #     decoded_content = raw_content # Fallback if encoding detection fails
  #   end
  #
  #   # Step 3: Ensure the content is now correctly in UTF-8 (no need to encode again)
  #   return decoded_content
  # end
  #   rescue StandardError => e
  #     puts "Error during encoding conversion: #{e.message}"
  #     puts e.backtrace.join("\n") # Print the full stack trace
  #     text
  #   end
  # end

  # Method to fix encoding issues in post content.
  #
  # Pages through Post in windows of @batch_size and reports every post whose
  # raw text changes under fix_text_encoding. Stops at the first empty window.
  # A failure while loading a window is allowed to propagate: swallowing it
  # would retry the same failing query forever, because the loop only ends on
  # an empty result.
  def fix_encoding
    offset = 0
    loop do
      puts "OFFSET: #{offset}"
      # Explicit ordering keeps the offset windows stable; without ORDER BY the
      # database may return rows in any order and windows can skip or repeat.
      posts = Post.order(:id).limit(@batch_size).offset(offset).to_a
      break if posts.empty?

      posts.each { |post| report_encoding_fix(post) }
      offset += @batch_size
    end
  end

  # Entry point: runs the whole correction pass with start/finish banners.
  def perform_encoding_correction
    puts "Encoding Correction beginning!"
    fix_encoding
    puts "Encoding Correction complete!"
  end

  private

  # Prints the before/after text for one post when the repair changes it.
  # Errors are reported and swallowed per post so that a single bad post does
  # not abort the rest of the batch.
  def report_encoding_fix(post)
    raw_content = post.raw
    puts "--> NEXT POST: post.id: #{post.id}"
    fixed_content = fix_text_encoding(raw_content)
    return if fixed_content == raw_content

    puts "Updating post #{post.id}"
    puts "------- raw_content:\n#{raw_content}"
    puts "+++++++ fixed_content:\n#{fixed_content}"
    puts "---------------------------------------------------------------------------------------------"
    # post.update(raw: fixed_content)
    # post.raw = fixed_content
    # if post.save
    #   puts "Post ##{post.id} updated successfully."
    # else
    #   puts "Failed to update Post ##{post.id}: #{post.errors.full_messages.join(', ')}"
    # end
  rescue StandardError => e
    puts "Error: #{e.message}"
    puts e.backtrace.join("\n") # Print the full stack trace
  end
end

GossamerForumsCorrectEncoding.new.perform_encoding_correction