2024-09-09 06:13:48 +00:00
|
|
|
# Federated Computer, Inc.
|
|
|
|
# David Sainty <saint@federated.computer> 2024 A.D.
|
|
|
|
# Gossamer Threads to Discourse -- Correct Encoding
|
2024-09-09 10:30:15 +00:00
|
|
|
# v0.6 Further attempt to get this reverse double encoding right now
|
2024-09-09 06:13:48 +00:00
|
|
|
|
|
|
|
require 'mysql2'
|
|
|
|
require 'active_record'
|
|
|
|
require 'charlock_holmes'
|
|
|
|
|
|
|
|
# require 'concurrent-ruby'
|
|
|
|
require File.expand_path("../../../../config/environment", __FILE__)
|
|
|
|
require File.expand_path("../../../../script/import_scripts/base", __FILE__)
|
|
|
|
|
|
|
|
class GossamerForumsCorrectEncoding < ImportScripts::Base
|
|
|
|
# Sets up the corrector: calls the superclass initializer, opens a MySQL
# connection to the Gossamer Forums database and sets the batch size.
#
# Connection settings are taken from the environment when present
# (GOSSAMER_DB_HOST, GOSSAMER_DB_USER, GOSSAMER_DB_PASSWORD, GOSSAMER_DB_NAME)
# and otherwise fall back to the values that were previously hard-coded, so
# running the script without those variables behaves exactly as before.
#
# If the connection cannot be established, the error and its backtrace are
# printed and the process exits with status 1.
def initialize
  super

  begin
    # Initialize MySQL client to connect to Gossamer Forums database.
    # SECURITY: the literal fallbacks below are credentials committed to the
    # source file. Prefer supplying them through the environment, and rotate
    # and remove the literals once nothing depends on them.
    @mysql_client = Mysql2::Client.new(
      host: ENV.fetch("GOSSAMER_DB_HOST", "slowtwitch.northend.network"),
      username: ENV.fetch("GOSSAMER_DB_USER", "admin"),
      password: ENV.fetch("GOSSAMER_DB_PASSWORD", "yxnh93Ybbz2Nm8#mp28zCVv"),
      database: ENV.fetch("GOSSAMER_DB_NAME", "slowtwitch")
    )
  rescue Mysql2::Error => e
    puts "Error connecting to MySQL: #{e.message}"
    puts e.backtrace.join("\n") # Print the full stack trace
    exit 1
  end

  @batch_size = 1000 # Number of posts to process in each batch
end
|
|
|
|
|
|
|
|
# Method to detect and fix text encoding
|
2024-09-09 10:45:05 +00:00
|
|
|
# Returns +content+ as UTF-8 text.
#
# * +nil+ is returned as +nil+.
# * A string that is already tagged UTF-8 and is valid UTF-8 is returned
#   unchanged (the very same object).
# * Any other string has its bytes reinterpreted as ISO-8859-1 and is
#   transcoded to UTF-8; a new string is returned.
#
# The argument itself is never modified: the reinterpretation is done on a
# copy, so the caller's string keeps its encoding tag and frozen strings are
# accepted. If transcoding raises a conversion error, the untouched argument
# is returned.
#
# NOTE(review): text that was double-encoded but is itself valid UTF-8 takes
# the early-return path and is NOT repaired here -- confirm whether that is
# the intended behaviour for this script.
#
# @param content [String, nil] text to normalise
# @return [String, nil] UTF-8 text, or the original value as described above
def fix_text_encoding(content)
  return content if content.nil?
  return content if content.encoding == Encoding::UTF_8 && content.valid_encoding?

  # Work on a copy: force_encoding retags the receiver in place, which would
  # otherwise leak an ISO-8859-1 tag back into the caller's string.
  # Every byte is defined in ISO-8859-1, so the result is always valid UTF-8
  # and no second pass is needed.
  content.dup.force_encoding(Encoding::ISO_8859_1).encode(Encoding::UTF_8)
rescue Encoding::UndefinedConversionError, Encoding::InvalidByteSequenceError
  # If conversion fails, leave the text as it was.
  content
end
|
2024-09-09 10:45:05 +00:00
|
|
|
|
|
|
|
|
|
|
|
# # Step 1: Try to detect encoding of the corrupted (double-encoded) content
|
|
|
|
# detection = CharlockHolmes::EncodingDetector.detect(broken_content)
|
|
|
|
# original_encoding = detection[:encoding]
|
|
|
|
# puts "Original encoding detected: #{original_encoding}"
|
|
|
|
#
|
|
|
|
# # Step 2: First decode the double-encoded content
|
|
|
|
# begin
|
|
|
|
# # Convert the content assuming it was double-encoded, so decode twice
|
|
|
|
# # First, convert from the detected encoding (ISO-8859-1 or windows-1252) to UTF-8
|
|
|
|
# first_pass = CharlockHolmes::Converter.convert(broken_content, original_encoding, 'UTF-8')
|
|
|
|
#
|
|
|
|
# # Step 3: Now re-interpret that output as if it's broken UTF-8 and convert it back to UTF-8
|
|
|
|
# fixed_content = CharlockHolmes::Converter.convert(first_pass, 'UTF-8', 'UTF-8')
|
|
|
|
#
|
|
|
|
# rescue => e
|
|
|
|
# puts "Error during encoding fix: #{e.message}"
|
|
|
|
# puts e.backtrace.join("\n") # Print the full stack trace
|
|
|
|
#
|
|
|
|
# fixed_content = broken_content # Fall back to the broken content if decoding fails
|
|
|
|
# end
|
|
|
|
#
|
|
|
|
# return fixed_content
|
|
|
|
#end
|
2024-09-09 10:30:15 +00:00
|
|
|
|
|
|
|
# # Detect encoding
|
|
|
|
# detection = CharlockHolmes::EncodingDetector.detect(raw_content)
|
|
|
|
# original_encoding = detection[:encoding]
|
|
|
|
# puts "Original encoding detected: #{original_encoding}"
|
|
|
|
|
2024-09-09 07:43:30 +00:00
|
|
|
# # Force the encoding to the detected one, then convert to UTF-8
|
|
|
|
# if original_encoding == 'ISO-8859-1' || original_encoding == 'windows-1252'
|
|
|
|
# # For Windows-1252 or ISO-8859-1, force the encoding and convert to UTF-8
|
|
|
|
# # text.force_encoding('ISO-8859-1').encode('UTF-8')
|
|
|
|
# text.force_encoding(original_encoding).encode('UTF-8', invalid: :replace, undef: :replace, replace: '?')
|
|
|
|
# else
|
|
|
|
# # Try to convert from detected encoding to UTF-8
|
|
|
|
# text.encode('UTF-8', original_encoding, invalid: :replace, undef: :replace, replace: '?')
|
|
|
|
# end
|
|
|
|
|
2024-09-09 10:30:15 +00:00
|
|
|
# if original_encoding
|
|
|
|
# begin
|
|
|
|
# decoded_content = CharlockHolmes::Converter.convert(raw_content, original_encoding, 'UTF-8')
|
|
|
|
# rescue => e
|
|
|
|
# puts "Error during encoding conversion: #{e.message}"
|
|
|
|
# decoded_content = raw_content # Fall back to raw content if decoding fails
|
|
|
|
# end
|
|
|
|
# else
|
|
|
|
# decoded_content = raw_content # Fallback if encoding detection fails
|
|
|
|
# end
|
|
|
|
#
|
|
|
|
# # Step 3: Ensure the content is now correctly in UTF-8 (no need to encode again)
|
|
|
|
# return decoded_content
|
|
|
|
# end
|
2024-09-09 07:43:30 +00:00
|
|
|
|
|
|
|
# rescue StandardError => e
|
|
|
|
# puts "Error during encoding conversion: #{e.message}"
|
|
|
|
# puts e.backtrace.join("\n") # Print the full stack trace
|
|
|
|
# text
|
|
|
|
# end
|
|
|
|
# end
|
2024-09-09 06:13:48 +00:00
|
|
|
|
|
|
|
# Method to fix encoding issues in post content
|
|
|
|
# Pages through Post records in batches of @batch_size and prints every post
# whose raw text differs from the result of fix_text_encoding.
#
# This is currently a dry run: the before/after text is printed, and the code
# that would write the corrected text back is left commented out below.
#
# Errors raised while handling a batch are printed with their backtrace and
# the loop moves on to the next batch.
# NOTE(review): if the batch query itself keeps raising, the empty-batch
# `break` is never reached and the loop will not terminate.
def fix_encoding
  offset = 0

  loop do
    puts "OFFSET: #{offset}"

    begin
      # Order by id so that limit/offset paging is deterministic; without an
      # explicit order the database may return rows in any order, letting
      # posts be skipped or visited twice across pages.
      posts = Post.order(:id).limit(@batch_size).offset(offset)
      break if posts.empty?

      posts.each do |post|
        raw_content = post.raw
        puts "--> NEXT POST: post.id: #{post.id}"

        fixed_content = fix_text_encoding(raw_content)
        next if fixed_content == raw_content

        puts "Updating post #{post.id}"
        puts "------- raw_content:\n#{raw_content}"
        puts "+++++++ fixed_content:\n#{fixed_content}"
        puts "---------------------------------------------------------------------------------------------"

        # Write-back is intentionally disabled while the output is reviewed:
        # post.update(raw: fixed_content)
        # post.raw = fixed_content
        # if post.save
        #   puts "Post ##{post.id} updated successfully."
        # else
        #   puts "Failed to update Post ##{post.id}: #{post.errors.full_messages.join(', ')}"
        # end
      end
    rescue StandardError => e
      # The exception must be bound to `e`; a bare `rescue` leaves it
      # undefined and the handler itself would fail with NameError.
      puts "Error: #{e.message}"
      puts e.backtrace.join("\n") # Print the full stack trace
    end

    offset += @batch_size
  end
end
|
|
|
|
|
|
|
|
# Entry point for the correction run: prints a start banner, runs
# fix_encoding, then prints a completion banner. The completion banner is
# only reached if fix_encoding returns normally.
def perform_encoding_correction
  puts("Encoding Correction beginning!")
  fix_encoding
  puts("Encoding Correction complete!")
end
|
|
|
|
end
|
|
|
|
|
2024-09-09 06:15:44 +00:00
|
|
|
# Script entry point: build the corrector and run the encoding correction.
corrector = GossamerForumsCorrectEncoding.new
corrector.perform_encoding_correction
|
2024-09-09 06:13:48 +00:00
|
|
|
|