# Extracted from patch 5f3708b ("v0.6", goss-correctencoding.rb,
# class GossamerForumsCorrectEncoding) -- Gossamer Threads to Discourse,
# Correct Encoding.

# Repairs "double-encoded" text (mojibake): UTF-8 bytes that were mistakenly
# read as Windows-1252 / ISO-8859-1 and then encoded to UTF-8 a second time
# (e.g. "café" stored as "cafÃ©").
#
# Why not UTF-8 -> UTF-8 conversion: mojibake is already *valid* UTF-8, so
# transcoding it to UTF-8 again changes nothing. The repair has to map every
# character back to the single byte it was mis-read from and then re-interpret
# those bytes as UTF-8.
#
# Behaviour by input:
# * nil / empty / pure ASCII            -> returned unchanged
# * bytes that are not valid UTF-8      -> treated as Windows-1252 and
#                                          transcoded once to UTF-8
# * valid UTF-8 that is reversible      -> the repaired string
# * valid UTF-8 that is not mojibake    -> returned unchanged
#
# NOTE(review): a genuine text that happens to look like mojibake (someone
# literally typing "Ã©") is indistinguishable and will be "repaired" too.
#
# @param broken_content [String, nil] text suspected of being double-encoded
# @return [String, nil] repaired UTF-8 text, or the input when no repair applies
def fix_text_encoding(broken_content)
  return broken_content if broken_content.nil? || broken_content.empty?

  text = broken_content.dup.force_encoding('UTF-8')

  # Not valid UTF-8 at all: it was never double-encoded, only never converted.
  unless text.valid_encoding?
    return broken_content.dup
                         .force_encoding('Windows-1252')
                         .encode('UTF-8', invalid: :replace, undef: :replace)
  end

  # ASCII is identical in every encoding involved; nothing to repair.
  return broken_content if text.ascii_only?

  original_bytes = mojibake_to_bytes(text)
  return broken_content unless original_bytes

  repaired = original_bytes.force_encoding('UTF-8')

  # Only accept the repair when the recovered bytes really are UTF-8;
  # correctly encoded text (e.g. a plain "é") fails this check and is kept.
  repaired.valid_encoding? ? repaired : broken_content
rescue EncodingError => e
  puts "Error during encoding fix: #{e.message}"
  broken_content # Fall back to the broken content if decoding fails
end

# Maps each character of +text+ back to the single byte it would have been
# mis-decoded from. Windows-1252 is tried first; code points U+0080..U+00FF
# that Windows-1252 cannot encode fall back to their ISO-8859-1 byte (covers
# the C1 control characters produced by Latin-1 style mis-decoding).
#
# @param text [String] valid UTF-8 text
# @return [String, nil] binary string of recovered bytes, or nil when some
#   character has no single-byte origin (so the text cannot be mojibake)
def mojibake_to_bytes(text)
  bytes = String.new(capacity: text.bytesize) # String.new => binary (ASCII-8BIT)

  text.each_char do |char|
    byte = begin
      char.encode('Windows-1252').b
    rescue Encoding::UndefinedConversionError
      return nil if char.ord > 0xFF

      char.ord.chr
    end
    bytes << byte
  end

  bytes
end