discourse-import_scripts/goss-correctencoding.rb

# Federated Computer, Inc.
# David Sainty <saint@federated.computer>  2024 A.D.
# Gossamer Threads to Discourse -- Correct Encoding
# v0.6 Further attempt to get this reverse double encoding right now

require 'mysql2'
require 'active_record'
require 'charlock_holmes'

# require 'concurrent-ruby'
require File.expand_path("../../../../config/environment", __FILE__)
require File.expand_path("../../../../script/import_scripts/base", __FILE__)

class GossamerForumsCorrectEncoding < ImportScripts::Base
  # Sets up the importer and opens the connection to the Gossamer Forums
  # MySQL database.
  #
  # Connection settings can be overridden with the GOSSAMER_DB_HOST,
  # GOSSAMER_DB_USERNAME, GOSSAMER_DB_PASSWORD and GOSSAMER_DB_NAME
  # environment variables. The values that used to be hard-coded remain as
  # fallbacks so existing invocations keep working unchanged.
  #
  # Prints the error and exits the process with status 1 when the MySQL
  # connection cannot be established.
  def initialize
    super

    begin
      # Initialize MySQL client to connect to Gossamer Forums database
      @mysql_client = Mysql2::Client.new(
        host: ENV.fetch("GOSSAMER_DB_HOST", "slowtwitch.northend.network"),
        username: ENV.fetch("GOSSAMER_DB_USERNAME", "admin"),
        # FIXME: a credential committed to source control should be rotated
        # and this fallback removed once GOSSAMER_DB_PASSWORD is set everywhere.
        password: ENV.fetch("GOSSAMER_DB_PASSWORD", "yxnh93Ybbz2Nm8#mp28zCVv"),
        database: ENV.fetch("GOSSAMER_DB_NAME", "slowtwitch")
      )
    rescue Mysql2::Error => e
      puts "Error connecting to MySQL: #{e.message}"
      puts e.backtrace.join("\n")  # Print the full stack trace
      exit 1
    end

    @batch_size = 1000   # Number of posts to process in each batch
  end

  # Method to detect and fix text encoding
  # Reverses "double encoding" (mojibake) in a piece of post text.
  #
  # The repair assumes the text is UTF-8 whose original bytes were misread as
  # Windows-1252 and then encoded to UTF-8 a second time. Mapping each
  # character back to its single CP1252 byte and reinterpreting those bytes as
  # UTF-8 undoes that extra step.
  #
  # An earlier iterative ISO-8859-1 -> UTF-8 attempt used to sit below this
  # method as orphaned statements. It converted in the wrong direction, never
  # converged for non-ASCII text, and its stray `end` closed the class early,
  # so it has been removed.
  #
  # @param content [String, nil] text to repair
  # @return [String, nil] the repaired text, or +content+ unchanged when it is
  #   nil/empty, cannot be represented in CP1252, or when the reinterpreted
  #   bytes are not valid UTF-8 (i.e. the text was not double-encoded)
  def fix_text_encoding(content)
    return content if content.nil? || content.empty?

    begin
      # Treat as Windows-1252 (cp1252) and then decode into UTF-8
      candidate = content.encode('CP1252').force_encoding('UTF-8')
    rescue Encoding::UndefinedConversionError, Encoding::InvalidByteSequenceError => e
      puts "Error during encoding conversion: #{e.message}"
      puts e.backtrace.join("\n")  # Print the full stack trace
      return content # Return the original content if conversion fails
    end

    # Correctly encoded text (e.g. a genuine "é") turns into invalid UTF-8
    # when round-tripped; keep the original instead of corrupting it.
    candidate.valid_encoding? ? candidate : content
  end
    
#    # Step 1: Try to detect encoding of the corrupted (double-encoded) content
#    detection = CharlockHolmes::EncodingDetector.detect(broken_content)
#    original_encoding = detection[:encoding]
#    puts "Original encoding detected: #{original_encoding}"
#  
#    # Step 2: First decode the double-encoded content
#    begin
#      # Convert the content assuming it was double-encoded, so decode twice
#        # First, convert from the detected encoding (ISO-8859-1 or windows-1252) to UTF-8
#      first_pass = CharlockHolmes::Converter.convert(broken_content, original_encoding, 'UTF-8')
#      
#      # Step 3: Now re-interpret that output as if it's broken UTF-8 and convert it back to UTF-8
#      fixed_content = CharlockHolmes::Converter.convert(first_pass, 'UTF-8', 'UTF-8')
#      
#    rescue => e
#      puts "Error during encoding fix: #{e.message}"
#      puts e.backtrace.join("\n")  # Print the full stack trace
#
#      fixed_content = broken_content # Fall back to the broken content if decoding fails
#    end
#  
#    return fixed_content
#end

#    # Detect encoding
#    detection = CharlockHolmes::EncodingDetector.detect(raw_content)
#    original_encoding = detection[:encoding]
#    puts "Original encoding detected: #{original_encoding}"
  
#      # Force the encoding to the detected one, then convert to UTF-8
#      if original_encoding == 'ISO-8859-1' || original_encoding == 'windows-1252'
#        # For Windows-1252 or ISO-8859-1, force the encoding and convert to UTF-8
#        # text.force_encoding('ISO-8859-1').encode('UTF-8')
#        text.force_encoding(original_encoding).encode('UTF-8', invalid: :replace, undef: :replace, replace: '?')
#      else
#        # Try to convert from detected encoding to UTF-8
#        text.encode('UTF-8', original_encoding, invalid: :replace, undef: :replace, replace: '?')
#      end

#    if original_encoding
#      begin
#        decoded_content = CharlockHolmes::Converter.convert(raw_content, original_encoding, 'UTF-8')
#      rescue => e
#        puts "Error during encoding conversion: #{e.message}"
#        decoded_content = raw_content # Fall back to raw content if decoding fails
#      end
#    else
#      decoded_content = raw_content # Fallback if encoding detection fails
#    end
#
#    # Step 3: Ensure the content is now correctly in UTF-8 (no need to encode again)
#    return decoded_content
#  end
        
#    rescue StandardError => e
#      puts "Error during encoding conversion: #{e.message}"
#      puts e.backtrace.join("\n")  # Print the full stack trace
#      text
#    end
#  end

    # Method to fix encoding issues in post content
  # Walks every Post in batches of @batch_size, runs fix_text_encoding on its
  # raw text and prints a before/after report for each post that would change.
  #
  # This is currently a dry run: the statements that would persist the fixed
  # text are left commented out below.
  #
  # Batches are ordered by id so limit/offset paging is deterministic. The
  # batch query runs outside the rescue so a persistent query failure aborts
  # the run instead of looping forever; an error raised while processing a
  # batch is printed and the run continues with the next batch.
  def fix_encoding
    offset = 0

    loop do
      puts "OFFSET:  #{offset}"
      # Load the batch once; an Array avoids a second query for the empty check.
      posts = Post.order(:id).limit(@batch_size).offset(offset).to_a
      break if posts.empty?

      begin
        posts.each do |post|
          raw_content = post.raw
          puts "--> NEXT POST:   post.id:  #{post.id}"
          fixed_content = fix_text_encoding(raw_content)
          next if fixed_content == raw_content

          puts "Updating post #{post.id}"
          puts "------- raw_content:\n#{raw_content}"
          puts "+++++++ fixed_content:\n#{fixed_content}"
          puts "---------------------------------------------------------------------------------------------"
          # post.update(raw: fixed_content)
          # post.raw = fixed_content
          # if post.save
          #   puts "Post ##{post.id} updated successfully."
          # else
          #   puts "Failed to update Post ##{post.id}: #{post.errors.full_messages.join(', ')}"
          # end
        end
      rescue StandardError => e
        puts "Error:  #{e.message}"
        puts e.backtrace.join("\n")  # Print the full stack trace
      end

      offset += @batch_size
    end
  end

  # Entry point for the script: announces the start of the run, delegates the
  # actual work to fix_encoding, then announces completion.
  def perform_encoding_correction
    puts("Encoding Correction beginning!")
    fix_encoding
    puts("Encoding Correction complete!")
  end
end

# Instantiate the corrector and run the encoding pass as soon as the script is loaded.
GossamerForumsCorrectEncoding.new.perform_encoding_correction
v0.1 - Added new script for encoding correction 2024-09-09 06:13:48 +00:00			`# Federated Computer, Inc.`
			`# David Sainty <saint@federated.computer> 2024 A.D.`
			`# Gossamer Threads to Discourse -- Correct Encoding`
v0.6 Further attempt to get this reverse dobule encoding right now 2024-09-09 10:30:15 +00:00			`# v0.6 Further attempt to get this reverse dobule encoding right now`
v0.1 - Added new script for encoding correction 2024-09-09 06:13:48 +00:00
			`require 'mysql2'`
			`require 'active_record'`
			`require 'charlock_holmes'`

			`# require 'concurrent-ruby'`
			`require File.expand_path("../../../../config/environment", __FILE__)`
			`require File.expand_path("../../../../script/import_scripts/base", __FILE__)`

			`class GossamerForumsCorrectEncoding < ImportScripts::Base`
			`def initialize`
			`super`
			`begin`
			`# Initialize MySQL client to connect to Gossamer Forums database`
			`@mysql_client = Mysql2::Client.new(`
			`host: "slowtwitch.northend.network",`
			`username: "admin",`
			`password: "yxnh93Ybbz2Nm8#mp28zCVv",`
			`database: "slowtwitch"`
			`)`
			`rescue Mysql2::Error => e`
			`puts "Error connecting to MySQL: #{e.message}"`
v0.6 Further attempt to get this reverse dobule encoding right now 2024-09-09 10:30:15 +00:00			`puts e.backtrace.join("\n") # Print the full stack trace`
v0.1 - Added new script for encoding correction 2024-09-09 06:13:48 +00:00			`exit 1`
			`end`
			`@batch_size = 1000 # Number of posts to process in each batch`
			`end`

			`# Method to detect and fix text encoding`
v0.6 Further attempt to get this reverse dobule encoding right now 2024-09-09 10:45:05 +00:00			`def fix_text_encoding(content)`
v0.6 Further attempt to get this reverse dobule encoding right now 2024-09-09 12:11:27 +00:00			`begin`
			`# Treat as Windows-1252 (cp1252) and then decode into UTF-8`
			`corrected_content = content.encode('CP1252').force_encoding('UTF-8')`
			`rescue Encoding::UndefinedConversionError => e`
			`puts "Error during encoding conversion: #{e.message}"`
			`puts e.backtrace.join("\n") # Print the full stack trace`
			`return content # Return the original content if conversion fails`
			`end`

			`corrected_content`
			`end`

v0.6 Further attempt to get this reverse dobule encoding right now 2024-09-09 10:53:10 +00:00			`# Ensure the content is treated as UTF-8 (even if incorrectly encoded)`
			`content.force_encoding('UTF-8')`
v0.2 - Add debugging 2024-09-09 06:29:14 +00:00
v0.6 Further attempt to get this reverse dobule encoding right now 2024-09-09 10:53:10 +00:00			`# Continue decoding until no more invalid sequences are found`
			`previous_content = ""`
			`while previous_content != content`
			`previous_content = content.dup`
v0.6 Further attempt to get this reverse dobule encoding right now 2024-09-09 10:30:15 +00:00
v0.6 Further attempt to get this reverse dobule encoding right now 2024-09-09 10:53:10 +00:00			`# Step 1: First attempt to convert from ISO-8859-1 to UTF-8`
			`if content.valid_encoding?`
			`# Decode from ISO-8859-1 (or Windows-1252) to UTF-8`
			`content = content.force_encoding('ISO-8859-1').encode('UTF-8', invalid: :replace, undef: :replace, replace: '?')`
			`else`
			`content = previous_content # Stop if invalid encoding issues arise`
			`end`
v0.6 Further attempt to get this reverse dobule encoding right now 2024-09-09 10:45:05 +00:00			`end`

v0.6 Further attempt to get this reverse dobule encoding right now 2024-09-09 10:53:10 +00:00			`return content`
v0.6 Further attempt to get this reverse dobule encoding right now 2024-09-09 10:30:15 +00:00			`end`
v0.6 Further attempt to get this reverse dobule encoding right now 2024-09-09 10:45:05 +00:00
			`# # Step 1: Try to detect encoding of the corrupted (double-encoded) content`
			`# detection = CharlockHolmes::EncodingDetector.detect(broken_content)`
			`# original_encoding = detection[:encoding]`
			`# puts "Original encoding detected: #{original_encoding}"`
			`#`
			`# # Step 2: First decode the double-encoded content`
			`# begin`
			`# # Convert the content assuming it was double-encoded, so decode twice`
			`# # First, convert from the detected encoding (ISO-8859-1 or windows-1252) to UTF-8`
			`# first_pass = CharlockHolmes::Converter.convert(broken_content, original_encoding, 'UTF-8')`
			`#`
			`# # Step 3: Now re-interpret that output as if it's broken UTF-8 and convert it back to UTF-8`
			`# fixed_content = CharlockHolmes::Converter.convert(first_pass, 'UTF-8', 'UTF-8')`
			`#`
			`# rescue => e`
			`# puts "Error during encoding fix: #{e.message}"`
			`# puts e.backtrace.join("\n") # Print the full stack trace`
			`#`
			`# fixed_content = broken_content # Fall back to the broken content if decoding fails`
			`# end`
			`#`
			`# return fixed_content`
			`#end`
v0.6 Further attempt to get this reverse dobule encoding right now 2024-09-09 10:30:15 +00:00
			`# # Detect encoding`
			`# detection = CharlockHolmes::EncodingDetector.detect(raw_content)`
			`# original_encoding = detection[:encoding]`
			`# puts "Original encoding detected: #{original_encoding}"`

v0.4 - Add debugging 2024-09-09 07:43:30 +00:00			`# # Force the encoding to the detected one, then covnert to UTF-8`
			`# if original_encoding == 'ISO-8859-1' \|\| original_encoding == 'windows-1252'`
			`# # For Windows-1252 or ISO-8859-1, force the encoding and convert to UTF-8`
			`# # text.force_encoding('ISO-8859-1').encode('UTF-8')`
			`# text.force_encoding(original_encoding).encode('UTF-8', invalid: :replace, undef: :replace, replace: '?')`
			`# else`
			`# # Try to convert from detected encoding to UTF-8`
			`# text.encode('UTF-8', original_encoding, invalid: :replace, undef: :replace, replace: '?')`
			`# end`

v0.6 Further attempt to get this reverse dobule encoding right now 2024-09-09 10:30:15 +00:00			`# if original_encoding`
			`# begin`
			`# decoded_content = CharlockHolmes::Converter.convert(raw_content, original_encoding, 'UTF-8')`
			`# rescue => e`
			`# puts "Error during encoding conversion: #{e.message}"`
			`# decoded_content = raw_content # Fall back to raw content if decoding fails`
			`# end`
			`# else`
			`# decoded_content = raw_content # Fallback if encoding detection fails`
			`# end`
			`#`
			`# # Step 3: Ensure the content is now correctly in UTF-8 (no need to encode again)`
			`# return decoded_content`
			`# end`
v0.4 - Add debugging 2024-09-09 07:43:30 +00:00
			`# rescue StandardError => e`
			`# puts "Error during encoding conversion: #{e.message}"`
			`# puts e.backtrace.join("\n") # Print the full stack trace`
			`# text`
			`# end`
			`# end`
v0.1 - Added new script for encoding correction 2024-09-09 06:13:48 +00:00
			`# Method to fix encoding issues in post content`
			`def fix_encoding`
			`offset = 0`

			`loop do`
v0.2 - Add debugging 2024-09-09 06:29:14 +00:00			`puts "OFFSET: #{offset}"`
			`begin`
			`posts = Post.limit(@batch_size).offset(offset)`
			`break if posts.empty?`
v0.1 - Added new script for encoding correction 2024-09-09 06:13:48 +00:00
v0.2 - Add debugging 2024-09-09 06:29:14 +00:00			`posts.each do \|post\|`
			`raw_content = post.raw`
			`puts "--> NEXT POST: post.id: #{post.id}"`
			`fixed_content = fix_text_encoding(raw_content)`
			`if fixed_content != raw_content`
			`puts "Updating post #{post.id}"`
			`puts "------- raw_content:\n#{raw_content}"`
			`puts "+++++++ fixed_content:\n#{fixed_content}"`
			`puts "---------------------------------------------------------------------------------------------"`
			`# post.update(raw: fixed_content)`
			`# post.raw = fixed_content`
			`# if post.save`
			`# puts "Post ##{post.id} updated successfully."`
			`# else`
			`# puts "Failed to update Post ##{post.id}: #{post.errors.full_messages.join(', ')}"`
			`# end`
			`end`
v0.1 - Added new script for encoding correction 2024-09-09 06:13:48 +00:00			`end`
v0.2 - Add debugging 2024-09-09 06:29:14 +00:00			`rescue`
			`puts "Error: #{e.message}"`
			`puts e.backtrace.join("\n") # Print the full stack trace`
v0.1 - Added new script for encoding correction 2024-09-09 06:13:48 +00:00			`end`

			`offset += @batch_size`
			`end`
			`end`

			`def perform_encoding_correction`
			`puts "Encoding Correction beginning!"`
v0.3 - Add debugging 2024-09-09 06:37:26 +00:00			`fix_encoding`
v0.1 - Added new script for encoding correction 2024-09-09 06:13:48 +00:00			`puts "Encoding Correction complete!"`
			`end`
			`end`

v0.1 - Added new script for encoding correction 2024-09-09 06:15:44 +00:00			`GossamerForumsCorrectEncoding.new.perform_encoding_correction`
v0.1 - Added new script for encoding correction 2024-09-09 06:13:48 +00:00