# Federated Computer, Inc.
# David Sainty 2024 A.D.
# Gossamer Threads to Discourse -- Correct Encoding
# v0.4 Debugging

require 'mysql2'
require 'active_record'
require 'charlock_holmes'
# require 'concurrent-ruby'
require File.expand_path("../../../../config/environment", __FILE__)
require File.expand_path("../../../../script/import_scripts/base", __FILE__)

# Walks every Discourse Post in batches, detects the character encoding of its
# raw content with CharlockHolmes, and prints the UTF-8 converted version for
# any post whose content would change. In this debugging revision nothing is
# written back to the database: the statements that would save the post are
# left commented out.
class GossamerForumsCorrectEncoding < ImportScripts::Base
  # Opens the MySQL connection to the Gossamer Forums database and sets the
  # batch size. Prints the error and exits with status 1 if the connection
  # cannot be established.
  #
  # Connection settings may be overridden with the GOSSAMER_DB_HOST,
  # GOSSAMER_DB_USERNAME, GOSSAMER_DB_PASSWORD and GOSSAMER_DB_NAME
  # environment variables; the original values remain as defaults so existing
  # invocations keep working.
  def initialize
    super
    begin
      # Initialize MySQL client to connect to Gossamer Forums database
      # FIXME(security): a live password is committed here as the default.
      # Rotate it and supply it via GOSSAMER_DB_PASSWORD, then drop the default.
      @mysql_client = Mysql2::Client.new(
        host: ENV.fetch("GOSSAMER_DB_HOST", "slowtwitch.northend.network"),
        username: ENV.fetch("GOSSAMER_DB_USERNAME", "admin"),
        password: ENV.fetch("GOSSAMER_DB_PASSWORD", "yxnh93Ybbz2Nm8#mp28zCVv"),
        database: ENV.fetch("GOSSAMER_DB_NAME", "slowtwitch")
      )
    rescue Mysql2::Error => e
      puts "Error connecting to MySQL: #{e.message}"
      exit 1
    end
    @batch_size = 1000 # Number of posts to process in each batch
  end

  # Method to detect and fix text encoding.
  #
  # text - the String to inspect (may be nil).
  #
  # Returns the text converted to UTF-8 from its detected encoding. The input
  # is returned unchanged when it is nil or empty, when no encoding can be
  # detected, or when the conversion raises.
  def fix_text_encoding(text)
    # Nothing to detect or convert for missing/empty content.
    return text if text.nil? || text.empty?

    # Detect encoding. The detector may yield no result, or a result without
    # an :encoding entry, so guard before indexing.
    detection = CharlockHolmes::EncodingDetector.detect(text)
    original_encoding = detection && detection[:encoding]
    puts "Original encoding detected: #{original_encoding}"

    # # Force the encoding to the detected one, then convert to UTF-8
    # if original_encoding == 'ISO-8859-1' || original_encoding == 'windows-1252'
    #   # For Windows-1252 or ISO-8859-1, force the encoding and convert to UTF-8
    #   # text.force_encoding('ISO-8859-1').encode('UTF-8')
    #   text.force_encoding(original_encoding).encode('UTF-8', invalid: :replace, undef: :replace, replace: '?')
    # else
    #   # Try to convert from detected encoding to UTF-8
    #   text.encode('UTF-8', original_encoding, invalid: :replace, undef: :replace, replace: '?')
    # end

    # Fallback if encoding detection fails
    return text unless original_encoding

    begin
      # The converted content is already UTF-8 (no need to encode again).
      CharlockHolmes::Converter.convert(text, original_encoding, 'UTF-8')
    rescue => e
      puts "Error during encoding conversion: #{e.message}"
      text # Fall back to the original content if decoding fails
    end
  end

  #   rescue StandardError => e
  #     puts "Error during encoding conversion: #{e.message}"
  #     puts e.backtrace.join("\n") # Print the full stack trace
  #     text
  #   end
  # end

  # Method to fix encoding issues in post content.
  #
  # Pages through all posts @batch_size at a time and, for each post whose
  # converted content differs from its raw content, prints both versions.
  # An error while handling one post is printed and the remaining posts are
  # still processed; an error from the batch query itself propagates, since
  # swallowing it would keep the loop from ever reaching its exit condition.
  def fix_encoding
    offset = 0
    loop do
      puts "OFFSET: #{offset}"

      # Order by id so that limit/offset paging is stable; without an explicit
      # order the database may return rows in a different order per query,
      # which lets batches overlap or skip posts.
      posts = Post.order(:id).limit(@batch_size).offset(offset).to_a
      break if posts.empty?

      posts.each do |post|
        begin
          raw_content = post.raw
          puts "--> NEXT POST: post.id: #{post.id}"
          fixed_content = fix_text_encoding(raw_content)

          if fixed_content != raw_content
            puts "Updating post #{post.id}"
            puts "------- raw_content:\n#{raw_content}"
            puts "+++++++ fixed_content:\n#{fixed_content}"
            puts "---------------------------------------------------------------------------------------------"
            # post.update(raw: fixed_content)
            # post.raw = fixed_content
            # if post.save
            #   puts "Post ##{post.id} updated successfully."
            # else
            #   puts "Failed to update Post ##{post.id}: #{post.errors.full_messages.join(', ')}"
            # end
          end
        rescue => e
          puts "Error: #{e.message}"
          puts e.backtrace.join("\n") # Print the full stack trace
        end
      end

      offset += @batch_size
    end
  end

  # Entry point: runs the encoding pass over all posts, printing a banner
  # before and after.
  def perform_encoding_correction
    puts "Encoding Correction beginning!"
    fix_encoding
    puts "Encoding Correction complete!"
  end
end

GossamerForumsCorrectEncoding.new.perform_encoding_correction