# Recovered from a whitespace-collapsed git patch:
#   commit 9dd478697ae3a8950db85901acfb3e5637a7fec4 (author "saint", 2024-09-09)
#   subject: "v0.6 Further attempt to get this reverse double encoding right now"
#   file:    goss-correctencoding.rb, inside
#            class GossamerForumsCorrectEncoding < ImportScripts::Base
#
# The same patch replaced an earlier CharlockHolmes-based implementation
# (EncodingDetector.detect followed by two Converter.convert passes, with a
# rescue that fell back to the input) and left a commented-out copy of it
# behind. That dead copy is intentionally not carried along here.

# Return +content+ as a UTF-8 string.
#
# * +nil+ is returned unchanged.
# * A string already tagged UTF-8 whose bytes are valid UTF-8 is returned
#   as-is (the very same object, no copy).
# * Anything else has its raw bytes interpreted as ISO-8859-1 and transcoded
#   to UTF-8; the result is a new string and the argument is left untouched.
#
# NOTE(review): because valid UTF-8 input is returned untouched, text that is
# already double-encoded but still valid UTF-8 (e.g. "Ã©" standing in for "é")
# passes through unrepaired. Confirm whether callers expect that case to be
# reversed here.
# NOTE(review): ISO-8859-1 is not Windows-1252. Bytes 0x80-0x9F become C1
# control characters, not the Windows-1252 punctuation (curly quotes, dashes,
# the euro sign). Verify which of the two the source data really uses.
#
# @param content [String, nil] text of unknown or incorrectly tagged encoding
# @return [String, nil] UTF-8 text, or nil when +content+ is nil
def fix_text_encoding(content)
  return content if content.nil?
  return content if content.encoding == Encoding::UTF_8 && content.valid_encoding?

  # Retag a copy rather than the argument: String#force_encoding changes its
  # receiver in place, which would alter the caller's string behind its back
  # and raise FrozenError on frozen input.
  #
  # Every byte value is a defined ISO-8859-1 character with a Unicode
  # mapping, so this conversion cannot raise and always yields valid UTF-8;
  # no rescue or second pass is needed.
  content.dup.force_encoding(Encoding::ISO_8859_1).encode(Encoding::UTF_8)
end