v0.6 Further attempt to get this reverse double encoding right now

This commit is contained in:
David Sainty 2024-09-09 20:53:10 +10:00
parent 9dd478697a
commit ad518a1512

View File

@ -32,33 +32,26 @@ class GossamerForumsCorrectEncoding < ImportScripts::Base
# Method to detect and fix text encoding # Method to detect and fix text encoding
def fix_text_encoding(content) def fix_text_encoding(content)
# Detect if content is already UTF-8 (should be the target encoding) # Ensure the content is treated as UTF-8 (even if incorrectly encoded)
if content.encoding == Encoding::UTF_8 && content.valid_encoding? content.force_encoding('UTF-8')
# Return as-is if it is already properly encoded
return content
end
# Step 1: Assume the content was incorrectly encoded as ISO-8859-1 (Windows-1252) and needs to be corrected # Continue decoding until no more invalid sequences are found
begin previous_content = ""
# Convert from ISO-8859-1 (or Windows-1252) back to UTF-8 while previous_content != content
# Force the encoding to ISO-8859-1 first, then encode to UTF-8 properly previous_content = content.dup
fixed_content = content.force_encoding("ISO-8859-1").encode("UTF-8")
rescue Encoding::UndefinedConversionError, Encoding::InvalidByteSequenceError
# If conversion fails, leave it as is
return content
end
# Step 2: After reversing the encoding once, check if the result is valid UTF-8 # Step 1: First attempt to convert from ISO-8859-1 to UTF-8
# If it's still not valid UTF-8, force it back and try again if content.valid_encoding?
unless fixed_content.valid_encoding? # Decode from ISO-8859-1 (or Windows-1252) to UTF-8
fixed_content = fixed_content.force_encoding("ISO-8859-1").encode("UTF-8") content = content.force_encoding('ISO-8859-1').encode('UTF-8', invalid: :replace, undef: :replace, replace: '?')
else
content = previous_content # Stop if invalid encoding issues arise
end
end end
# Return the properly decoded content return content
return fixed_content
end end
# # Step 1: Try to detect encoding of the corrupted (double-encoded) content # # Step 1: Try to detect encoding of the corrupted (double-encoded) content
# detection = CharlockHolmes::EncodingDetector.detect(broken_content) # detection = CharlockHolmes::EncodingDetector.detect(broken_content)
# original_encoding = detection[:encoding] # original_encoding = detection[:encoding]