90 lines
2.8 KiB
Ruby
90 lines
2.8 KiB
Ruby
|
# Federated Computer, Inc.
|
||
|
# David Sainty <saint@federated.computer> 2024 A.D.
|
||
|
# Gossamer Threads to Discourse -- Correct Encoding
|
||
|
# v0.1 New script
|
||
|
|
||
|
require 'mysql2'
|
||
|
require 'active_record'
|
||
|
require 'charlock_holmes'
|
||
|
|
||
|
# require 'concurrent-ruby'
|
||
|
require File.expand_path("../../../../config/environment", __FILE__)
|
||
|
require File.expand_path("../../../../script/import_scripts/base", __FILE__)
|
||
|
|
||
|
# Walks every Post record and rewrites its `raw` text as UTF-8 when
# charlock_holmes reports a different source encoding.
#
# `Post` and `ImportScripts::Base` are not defined in this file; they are
# expected to come from the application environment / import base script
# required at the top of the file.
class GossamerForumsCorrectEncoding < ImportScripts::Base
  # Number of posts loaded per batch when no other value is configured.
  DEFAULT_BATCH_SIZE = 1000

  # Opens the connection to the Gossamer Forums MySQL database and sets the
  # batch size. Exits the process with status 1 if the connection fails.
  #
  # Connection settings can be overridden through the GOSSAMER_DB_* environment
  # variables; when a variable is unset the previous built-in value is used, so
  # existing invocations keep working.
  #
  # NOTE(review): the fallback password is a secret committed to source control.
  # It should be rotated and the fallback removed once callers set
  # GOSSAMER_DB_PASSWORD.
  # NOTE(review): none of the methods in this class read @mysql_client; the
  # connection is kept only because the original script established it.
  def initialize
    super
    begin
      # Initialize MySQL client to connect to Gossamer Forums database
      @mysql_client = Mysql2::Client.new(
        host: ENV.fetch("GOSSAMER_DB_HOST", "slowtwitch.northend.network"),
        username: ENV.fetch("GOSSAMER_DB_USERNAME", "admin"),
        password: ENV.fetch("GOSSAMER_DB_PASSWORD", "yxnh93Ybbz2Nm8#mp28zCVv"),
        database: ENV.fetch("GOSSAMER_DB_NAME", "slowtwitch")
      )
    rescue Mysql2::Error => e
      puts "Error connecting to MySQL: #{e.message}"
      exit 1
    end
    @batch_size = DEFAULT_BATCH_SIZE # Number of posts to process in each batch
  end

  # Detects the encoding of +text+ and returns a UTF-8 version of it.
  #
  # The argument is never modified. The original +text+ is returned unchanged
  # when it is nil or empty, when no encoding could be detected, or when the
  # conversion raises.
  #
  # @param text [String, nil] the text to convert
  # @return [String, nil] the converted text, or +text+ itself on failure
  def fix_text_encoding(text)
    return text if text.nil? || text.empty?

    # Detect encoding
    detection = CharlockHolmes::Detect.detect(text)
    original_encoding = detection && detection[:encoding]
    puts "Original encoding detected: #{original_encoding}"
    return text unless original_encoding

    if original_encoding == 'ISO-8859-1'
      # Work on a copy: force_encoding retags the receiver in place, which
      # would otherwise alter the caller's string (and fail on frozen strings).
      text.dup.force_encoding('ISO-8859-1').encode('UTF-8')
    else
      # Try to convert from detected encoding to UTF-8
      text.encode('UTF-8', original_encoding, invalid: :replace, undef: :replace, replace: '?')
    end
  rescue StandardError => e
    puts "Error during encoding conversion: #{e.message}"
    text
  end

  # Re-encodes the `raw` content of every post and saves the posts whose
  # content changed, printing the before/after text and the save result.
  #
  # Uses find_each, which batches on the primary key, instead of unordered
  # limit/offset paging: without a stable order, rows updated mid-run can move
  # between pages and be skipped or processed twice.
  def fix_encoding
    Post.find_each(batch_size: @batch_size) do |post|
      raw_content = post.raw
      fixed_content = fix_text_encoding(raw_content)
      next if fixed_content == raw_content

      puts "Updating post ##{post.id}"
      puts "------- raw_content:\n#{raw_content}"
      puts "+++++++ fixed_content:\n#{fixed_content}"
      puts "---------------------------------------------------------------------------------------------"

      post.raw = fixed_content
      if post.save
        puts "Post ##{post.id} updated successfully."
      else
        puts "Failed to update Post ##{post.id}: #{post.errors.full_messages.join(', ')}"
      end
    end
  end

  # Entry point: runs the encoding correction over all posts, with progress
  # messages before and after.
  def perform_encoding_correction
    puts "Encoding Correction beginning!"
    # Previously this called destroy_deleted_posts_from_gossamer_with_user,
    # which this class does not define; the work of this script is fix_encoding.
    fix_encoding
    puts "Encoding Correction complete!"
  end
end
|
||
|
|
||
|
# Run the encoding correction when this script is executed. The class defined
# in this file is GossamerForumsCorrectEncoding; the previous constant
# (GossamerForumsDestroyDeletedPosts) is not defined here and raised NameError.
GossamerForumsCorrectEncoding.new.perform_encoding_correction
|
||
|
|