v0.6 Further attempt to get this reverse double encoding right now
This commit is contained in:
parent
5f3708b307
commit
9dd478697a
@ -31,31 +31,58 @@ class GossamerForumsCorrectEncoding < ImportScripts::Base
|
|||||||
end
|
end
|
||||||
|
|
||||||
# Method to detect and fix text encoding.
#
# Text that is already valid UTF-8 is returned unchanged. Anything else has
# its bytes interpreted as ISO-8859-1 and is transcoded to UTF-8.
#
# @param content [String, nil] text whose encoding should be repaired
# @return [String, nil] +content+ itself when it is nil or already valid
#   UTF-8; otherwise a new UTF-8 string. The argument is never modified.
#
# NOTE(review): because valid UTF-8 is returned untouched, double-encoded
# text that is itself valid UTF-8 is not repaired here -- confirm whether
# that case still needs handling by the caller.
def fix_text_encoding(content)
  return content if content.nil?
  return content if content.encoding == Encoding::UTF_8 && content.valid_encoding?

  # Work on a copy: String#force_encoding retags its receiver in place, which
  # would silently change the caller's string and raise FrozenError on frozen
  # input.
  #
  # Every byte value maps to a character in ISO-8859-1, so the transcoded
  # result is always valid UTF-8 and no second conversion pass is needed.
  content.dup.force_encoding(Encoding::ISO_8859_1).encode(Encoding::UTF_8)
rescue Encoding::UndefinedConversionError, Encoding::InvalidByteSequenceError
  # If conversion fails, leave it as is
  content
end
|
||||||
|
|
||||||
|
|
||||||
|
# # Step 1: Try to detect encoding of the corrupted (double-encoded) content
|
||||||
|
# detection = CharlockHolmes::EncodingDetector.detect(broken_content)
|
||||||
|
# original_encoding = detection[:encoding]
|
||||||
|
# puts "Original encoding detected: #{original_encoding}"
|
||||||
|
#
|
||||||
|
# # Step 2: First decode the double-encoded content
|
||||||
|
# begin
|
||||||
|
# # Convert the content assuming it was double-encoded, so decode twice
|
||||||
|
# # First, convert from the detected encoding (ISO-8859-1 or windows-1252) to UTF-8
|
||||||
|
# first_pass = CharlockHolmes::Converter.convert(broken_content, original_encoding, 'UTF-8')
|
||||||
|
#
|
||||||
|
# # Step 3: Now re-interpret that output as if it's broken UTF-8 and convert it back to UTF-8
|
||||||
|
# fixed_content = CharlockHolmes::Converter.convert(first_pass, 'UTF-8', 'UTF-8')
|
||||||
|
#
|
||||||
|
# rescue => e
|
||||||
|
# puts "Error during encoding fix: #{e.message}"
|
||||||
|
# puts e.backtrace.join("\n") # Print the full stack trace
|
||||||
|
#
|
||||||
|
# fixed_content = broken_content # Fall back to the broken content if decoding fails
|
||||||
|
# end
|
||||||
|
#
|
||||||
|
# return fixed_content
|
||||||
|
#end
|
||||||
|
|
||||||
# # Detect encoding
|
# # Detect encoding
|
||||||
# detection = CharlockHolmes::EncodingDetector.detect(raw_content)
|
# detection = CharlockHolmes::EncodingDetector.detect(raw_content)
|
||||||
# original_encoding = detection[:encoding]
|
# original_encoding = detection[:encoding]
|
||||||
|
Loading…
Reference in New Issue
Block a user