From 538185095dc1b946891d4557db3458765f94311b Mon Sep 17 00:00:00 2001
From: saint
Date: Mon, 9 Sep 2024 16:29:14 +1000
Subject: [PATCH] v0.2 - Add debugging

---
Review notes (ignored by git am; not part of the commit message):
 * fix_text_encoding: the added `begin` had no matching `end`, which left the
   method definition unclosed; the missing `end` is now part of the hunk.
 * Post loop: the bare `rescue` used an unbound `e`; the handler now binds the
   exception with `rescue StandardError => e`.
 * Hunk headers and diffstat were adjusted for the one extra inserted line;
   the post-image blob hash on the `index` line is therefore stale.

 goss-correctencoding.rb | 74 +++++++++++++++++++++++------------------
 1 file changed, 42 insertions(+), 32 deletions(-)

diff --git a/goss-correctencoding.rb b/goss-correctencoding.rb
index 7a3f833..5c7edcf 100644
--- a/goss-correctencoding.rb
+++ b/goss-correctencoding.rb
@@ -1,7 +1,7 @@
 # Federated Computer, Inc.
 # David Sainty 2024 A.D.
 # Gossamer Threads to Discourse -- Correct Encoding
-# v0.1 New script
+# v0.2 Debugging
 
 require 'mysql2'
 require 'active_record'
@@ -31,20 +31,23 @@ class GossamerForumsCorrectEncoding < ImportScripts::Base
 
   # Method to detect and fix text encoding
   def fix_text_encoding(text)
-    # Detect encoding
-    detection = CharlockHolmes::Detect.detect(text)
-    original_encoding = detection[:encoding]
-    puts "Original encoding detected: #{original_encoding}"
-
-    if original_encoding == 'ISO-8859-1'
-      text.force_encoding('ISO-8859-1').encode('UTF-8')
-    else
-      # Try to convert from detected encoding to UTF-8
-      text.encode('UTF-8', original_encoding, invalid: :replace, undef: :replace, replace: '?')
-    end
-  rescue StandardError => e
-    puts "Error during encoding conversion: #{e.message}"
-    text
+    begin
+      # Detect encoding
+      detection = CharlockHolmes::Detect.detect(text)
+      original_encoding = detection[:encoding]
+      puts "Original encoding detected: #{original_encoding}"
+
+      if original_encoding == 'ISO-8859-1'
+        text.force_encoding('ISO-8859-1').encode('UTF-8')
+      else
+        # Try to convert from detected encoding to UTF-8
+        text.encode('UTF-8', original_encoding, invalid: :replace, undef: :replace, replace: '?')
+      end
+    rescue StandardError => e
+      puts "Error during encoding conversion: #{e.message}"
+      puts e.backtrace.join("\n")  # Print the full stack trace
+      text
+    end
   end
 
   # Method to fix encoding issues in post content
@@ -52,25 +55,32 @@ class GossamerForumsCorrectEncoding < ImportScripts::Base
     offset = 0
 
     loop do
-      posts = Post.limit(@batch_size).offset(offset)
-      break if posts.empty?
+      puts "OFFSET: #{offset}"
+      begin
+        posts = Post.limit(@batch_size).offset(offset)
+        break if posts.empty?
 
-      posts.each do |post|
-        raw_content = post.raw
-        fixed_content = fix_text_encoding(raw_content)
-        if fixed_content != raw_content
-          puts "Updating post ##{post.id}"
-          puts "------- raw_content:\n#{raw_content}"
-          puts "+++++++ fixed_content:\n#{fixed_content}"
-          puts "---------------------------------------------------------------------------------------------"
-          # post.update(raw: fixed_content)
-#         post.raw = fixed_content
-#         if post.save
-#           puts "Post ##{post.id} updated successfully."
-#         else
-#           puts "Failed to update Post ##{post.id}: #{post.errors.full_messages.join(', ')}"
-#         end
+        posts.each do |post|
+          raw_content = post.raw
+          puts "--> NEXT POST: post.id: #{post.id}"
+          fixed_content = fix_text_encoding(raw_content)
+          if fixed_content != raw_content
+            puts "Updating post #{post.id}"
+            puts "------- raw_content:\n#{raw_content}"
+            puts "+++++++ fixed_content:\n#{fixed_content}"
+            puts "---------------------------------------------------------------------------------------------"
+            # post.update(raw: fixed_content)
+#           post.raw = fixed_content
+#           if post.save
+#             puts "Post ##{post.id} updated successfully."
+#           else
+#             puts "Failed to update Post ##{post.id}: #{post.errors.full_messages.join(', ')}"
+#           end
+          end
         end
+      rescue StandardError => e
+        puts "Error: #{e.message}"
+        puts e.backtrace.join("\n")  # Print the full stack trace
       end
 
       offset += @batch_size