# Recovered from patch ad518a151299229b23f0bc62d5784e86fa4ee705
# (saint, Mon, 9 Sep 2024 20:53:10 +1000):
#   "v0.6 Further attempt to get this reverse double encoding right now"
# Target file: goss-correctencoding.rb
# Enclosing class: GossamerForumsCorrectEncoding < ImportScripts::Base

# Reverses "double encoding" (mojibake): text whose UTF-8 bytes were at some
# point misread as Windows-1252 / ISO-8859-1 and then re-encoded as UTF-8
# (e.g. "Ã©" where "é" was intended).
#
# Each pass maps the characters back to the legacy single-byte encoding and
# reinterprets the resulting bytes as UTF-8. A pass is accepted only when the
# reinterpreted bytes are valid UTF-8, so text that is already correct is
# returned unchanged. Every accepted pass makes the string strictly shorter in
# bytes, which guarantees the loop terminates.
#
# @param content [String, nil] text to repair; it is never mutated
# @return [String, nil] a UTF-8-tagged copy with every reversible layer of
#   double encoding removed; +nil+ when +content+ is +nil+. Bytes that are not
#   valid UTF-8 to begin with are returned as-is (tagged UTF-8, still invalid).
def fix_text_encoding(content)
  return content if content.nil?

  # Windows-1252 is tried first because it covers the printable characters in
  # 0x80-0x9F (such as the euro sign and curly quotes); ISO-8859-1 is the
  # fallback for text in which those bytes were decoded as C1 control characters.
  legacy_encodings = [Encoding::Windows_1252, Encoding::ISO_8859_1]

  # Work on a copy: force_encoding retags the receiver in place and would raise
  # FrozenError on a frozen argument.
  text = content.dup.force_encoding(Encoding::UTF_8)

  loop do
    # Pure ASCII has nothing to repair; invalid bytes cannot be reversed safely.
    break if text.ascii_only? || !text.valid_encoding?

    repaired = nil
    legacy_encodings.each do |legacy|
      begin
        candidate = text.encode(legacy).force_encoding(Encoding::UTF_8)
      rescue Encoding::UndefinedConversionError
        # A character does not exist in this legacy encoding, so the text
        # cannot be the product of a misread through it.
        next
      end
      next unless candidate.valid_encoding?

      repaired = candidate
      break
    end

    # No legacy encoding produced valid UTF-8: the text is as decoded as it gets.
    break unless repaired

    text = repaired
  end

  text
end

# # Step 1: Try to detect encoding of the corrupted (double-encoded) content
# detection = CharlockHolmes::EncodingDetector.detect(broken_content)
# original_encoding = detection[:encoding]