# frozen_string_literal: true

# Federated Computer, Inc.
# David Sainty  2024 A.D.
# Gossamer Threads to Discourse -- Correct Encoding
# v0.1  New script
#
# Provenance: recovered from patch 4f6d4ef5e763ff7e9f1cc2eda8e73414b8fc82c0
# (author "saint", Mon, 9 Sep 2024 16:13:48 +1000), subject
# "[PATCH] v0.1 - Added new script for encoding correction", which creates
# goss-correctencoding.rb.

require 'mysql2'
require 'active_record'
require 'charlock_holmes'

# require 'concurrent-ruby'
require File.expand_path("../../../../config/environment", __FILE__)
require File.expand_path("../../../../script/import_scripts/base", __FILE__)

# Iterates over Post records, runs each post's raw text through charset
# detection, and saves the UTF-8 re-encoded text whenever it differs from
# what is currently stored.
class GossamerForumsCorrectEncoding < ImportScripts::Base
  # Initializes the importer base class, opens the Gossamer Forums MySQL
  # connection and sets the batch size used when iterating posts.
  #
  # Connection settings are read from the environment (see #mysql_config).
  # The process exits with status 1 when the password is not configured or
  # the connection cannot be established.
  def initialize
    super
    begin
      # Initialize MySQL client to connect to Gossamer Forums database
      @mysql_client = Mysql2::Client.new(mysql_config)
    rescue KeyError => e
      puts "Missing MySQL configuration: #{e.message}"
      exit 1
    rescue Mysql2::Error => e
      puts "Error connecting to MySQL: #{e.message}"
      exit 1
    end
    @batch_size = 1000 # Number of posts to process in each batch
  end

  # Detects the encoding of +text+ and returns a UTF-8 transcoded copy.
  #
  # The argument itself is never modified. The original string is returned
  # unchanged when it is nil or empty, when no encoding could be detected,
  # or when the conversion raises.
  #
  # NOTE(review): the detector's guess is trusted unconditionally; text that
  # is already valid UTF-8 but reported as another charset would be
  # re-encoded from that charset. Confirm this is the intended behavior.
  #
  # @param text [String, nil] text to convert
  # @return [String, nil] converted text, or +text+ itself on failure
  def fix_text_encoding(text)
    return text if text.nil? || text.empty?

    # Detect encoding
    detection = CharlockHolmes::Detect.detect(text)
    original_encoding = detection && detection[:encoding]
    puts "Original encoding detected: #{original_encoding}"
    return text unless original_encoding

    if original_encoding == 'ISO-8859-1'
      # Work on a copy: force_encoding retags the receiver in place, which
      # would otherwise alter the caller's string (the post's raw attribute).
      text.dup.force_encoding('ISO-8859-1').encode('UTF-8')
    else
      # Try to convert from detected encoding to UTF-8
      text.encode('UTF-8', original_encoding, invalid: :replace, undef: :replace, replace: '?')
    end
  rescue StandardError => e
    puts "Error during encoding conversion: #{e.message}"
    text
  end

  # Runs the encoding fix over every Post.
  #
  # find_each walks the table in primary-key order in batches of
  # @batch_size, so each post is visited exactly once even though rows are
  # being updated during the scan (unordered LIMIT/OFFSET paging gives no
  # such guarantee).
  def fix_encoding
    Post.find_each(batch_size: @batch_size) { |post| correct_post(post) }
  end

  # Script entry point: prints start/finish banners around #fix_encoding.
  def perform_encoding_correction
    puts "Encoding Correction beginning!"
    fix_encoding
    puts "Encoding Correction complete!"
  end

  private

  # Builds the Mysql2 connection options from environment variables:
  # GOSSAMER_DB_HOST, GOSSAMER_DB_USERNAME and GOSSAMER_DB_NAME fall back to
  # the values this script has always used; GOSSAMER_DB_PASSWORD is
  # mandatory (ENV.fetch raises KeyError when it is unset) so that the
  # secret is not kept in source control.
  def mysql_config
    {
      host: ENV.fetch('GOSSAMER_DB_HOST', 'slowtwitch.northend.network'),
      username: ENV.fetch('GOSSAMER_DB_USERNAME', 'admin'),
      password: ENV.fetch('GOSSAMER_DB_PASSWORD'),
      database: ENV.fetch('GOSSAMER_DB_NAME', 'slowtwitch')
    }
  end

  # Re-encodes a single post's raw text and saves it when the converted
  # text differs from the stored text, logging before/after content and the
  # outcome of the save.
  def correct_post(post)
    raw_content = post.raw
    fixed_content = fix_text_encoding(raw_content)
    return if fixed_content == raw_content

    puts "Updating post ##{post.id}"
    puts "------- raw_content:\n#{raw_content}"
    puts "+++++++ fixed_content:\n#{fixed_content}"
    puts "---------------------------------------------------------------------------------------------"
    post.raw = fixed_content
    if post.save
      puts "Post ##{post.id} updated successfully."
    else
      puts "Failed to update Post ##{post.id}: #{post.errors.full_messages.join(', ')}"
    end
  end
end

GossamerForumsCorrectEncoding.new.perform_encoding_correction