# discourse-import_scripts/goss-correctencoding.rb
# Federated Computer, Inc.
# David Sainty <saint@federated.computer> 2024 A.D.
# Gossamer Threads to Discourse -- Correct Encoding
# v0.4 Debugging
require 'mysql2'
require 'active_record'
require 'charlock_holmes'
# require 'concurrent-ruby'
require File.expand_path("../../../../config/environment", __FILE__)
require File.expand_path("../../../../script/import_scripts/base", __FILE__)
# Walks every Discourse post in batches, detects the character encoding of the
# post's raw text with CharlockHolmes, converts it to UTF-8, and prints a
# before/after comparison whenever the converted text differs from the stored
# text.
#
# This is a report-only (dry-run) pass: the code that would write the converted
# text back to the post is deliberately left commented out.
class GossamerForumsCorrectEncoding < ImportScripts::Base
  # Number of posts to process in each batch.
  BATCH_SIZE = 1000

  # Connects to the Gossamer Forums MySQL database and sets the batch size.
  # Exits the process with status 1 if the MySQL connection cannot be made.
  def initialize
    super
    @mysql_client = connect_to_gossamer
    @batch_size = BATCH_SIZE
  end

  # Detects the encoding of +text+ and converts it to UTF-8.
  #
  # @param text [String, nil] the raw text to normalise
  # @return [String, nil] the UTF-8 converted text, or +text+ unchanged when it
  #   is nil/empty, when no encoding could be detected, or when the conversion
  #   raises
  def fix_text_encoding(text)
    return text if text.nil? || text.empty?

    detection = CharlockHolmes::EncodingDetector.detect(text)
    # The detector may return nothing, or a result without an :encoding entry.
    original_encoding = detection && detection[:encoding]
    puts "Original encoding detected: #{original_encoding}"
    return text unless original_encoding

    begin
      CharlockHolmes::Converter.convert(text, original_encoding, "UTF-8")
    rescue StandardError => e
      puts "Error during encoding conversion: #{e.message}"
      text # Fall back to the untouched input if the conversion fails
    end
  end

  # Iterates over all posts in batches of @batch_size and previews the encoding
  # fix for each one.
  #
  # Batches are ordered by id so that offset pagination is stable. Errors are
  # rescued per post, so one bad post does not abort the rest of its batch; an
  # error raised by the batch query itself is not rescued and stops the run
  # instead of retrying forever.
  def fix_encoding
    offset = 0

    loop do
      puts "OFFSET: #{offset}"

      # Load the batch once; the same array serves the emptiness check and the
      # iteration.
      posts = Post.order(:id).limit(@batch_size).offset(offset).to_a
      break if posts.empty?

      posts.each do |post|
        begin
          preview_post_fix(post)
        rescue StandardError => e
          puts "Error: #{e.message}"
          puts e.backtrace.join("\n") # Print the full stack trace
        end
      end

      offset += @batch_size
    end
  end

  # Entry point: runs the encoding pass with start/finish banners.
  def perform_encoding_correction
    puts "Encoding Correction beginning!"
    fix_encoding
    puts "Encoding Correction complete!"
  end

  private

  # Opens the MySQL connection to the Gossamer Forums database.
  #
  # Each setting can be overridden through an environment variable; the literal
  # fallbacks preserve the script's previous behaviour.
  # FIXME: the fallback password is a credential committed to source control --
  # rotate it and supply it only through GOSSAMER_DB_PASSWORD.
  def connect_to_gossamer
    Mysql2::Client.new(
      host: ENV.fetch("GOSSAMER_DB_HOST", "slowtwitch.northend.network"),
      username: ENV.fetch("GOSSAMER_DB_USERNAME", "admin"),
      password: ENV.fetch("GOSSAMER_DB_PASSWORD", "yxnh93Ybbz2Nm8#mp28zCVv"),
      database: ENV.fetch("GOSSAMER_DB_NAME", "slowtwitch")
    )
  rescue Mysql2::Error => e
    puts "Error connecting to MySQL: #{e.message}"
    exit 1
  end

  # Prints the stored and converted text of +post+ when they differ.
  def preview_post_fix(post)
    raw_content = post.raw
    puts "--> NEXT POST: post.id: #{post.id}"

    fixed_content = fix_text_encoding(raw_content)
    return if fixed_content == raw_content

    puts "Updating post #{post.id}"
    puts "------- raw_content:\n#{raw_content}"
    puts "+++++++ fixed_content:\n#{fixed_content}"
    puts "---------------------------------------------------------------------------------------------"

    # Persistence is intentionally disabled while the output is being reviewed:
    # post.raw = fixed_content
    # if post.save
    #   puts "Post ##{post.id} updated successfully."
    # else
    #   puts "Failed to update Post ##{post.id}: #{post.errors.full_messages.join(', ')}"
    # end
  end
end
# Script entry point: build the corrector, then run the encoding pass.
encoding_corrector = GossamerForumsCorrectEncoding.new
encoding_corrector.perform_encoding_correction