v0.6 Further attempt to get this reverse double encoding right now
This commit is contained in:
parent
3331675fb2
commit
5f3708b307
@ -1,7 +1,7 @@
|
|||||||
# Federated Computer, Inc.
|
# Federated Computer, Inc.
|
||||||
# David Sainty <saint@federated.computer> 2024 A.D.
|
# David Sainty <saint@federated.computer> 2024 A.D.
|
||||||
# Gossamer Threads to Discourse -- Correct Encoding
|
# Gossamer Threads to Discourse -- Correct Encoding
|
||||||
# v0.4 Debugging
|
# v0.6 Further attempt to get this reverse double encoding right now
|
||||||
|
|
||||||
require 'mysql2'
|
require 'mysql2'
|
||||||
require 'active_record'
|
require 'active_record'
|
||||||
@ -24,18 +24,43 @@ class GossamerForumsCorrectEncoding < ImportScripts::Base
|
|||||||
)
|
)
|
||||||
rescue Mysql2::Error => e
|
rescue Mysql2::Error => e
|
||||||
puts "Error connecting to MySQL: #{e.message}"
|
puts "Error connecting to MySQL: #{e.message}"
|
||||||
|
puts e.backtrace.join("\n") # Print the full stack trace
|
||||||
exit 1
|
exit 1
|
||||||
end
|
end
|
||||||
@batch_size = 1000 # Number of posts to process in each batch
|
@batch_size = 1000 # Number of posts to process in each batch
|
||||||
end
|
end
|
||||||
|
|
||||||
# Method to detect and fix text encoding
|
# Method to detect and fix text encoding
|
||||||
def fix_text_encoding(raw_content)
|
def fix_text_encoding(broken_content)
|
||||||
# Detect encoding
|
# Step 1: Try to detect encoding of the corrupted (double-encoded) content
|
||||||
detection = CharlockHolmes::EncodingDetector.detect(raw_content)
|
detection = CharlockHolmes::EncodingDetector.detect(broken_content)
|
||||||
original_encoding = detection[:encoding]
|
original_encoding = detection[:encoding]
|
||||||
puts "Original encoding detected: #{original_encoding}"
|
puts "Original encoding detected: #{original_encoding}"
|
||||||
|
|
||||||
|
# Step 2: First decode the double-encoded content
|
||||||
|
begin
|
||||||
|
# Convert the content assuming it was double-encoded, so decode twice
|
||||||
|
# First, convert from the detected encoding (ISO-8859-1 or windows-1252) to UTF-8
|
||||||
|
first_pass = CharlockHolmes::Converter.convert(broken_content, original_encoding, 'UTF-8')
|
||||||
|
|
||||||
|
# Step 3: Now re-interpret that output as if it's broken UTF-8 and convert it back to UTF-8
|
||||||
|
fixed_content = CharlockHolmes::Converter.convert(first_pass, 'UTF-8', 'UTF-8')
|
||||||
|
|
||||||
|
rescue => e
|
||||||
|
puts "Error during encoding fix: #{e.message}"
|
||||||
|
puts e.backtrace.join("\n") # Print the full stack trace
|
||||||
|
|
||||||
|
fixed_content = broken_content # Fall back to the broken content if decoding fails
|
||||||
|
end
|
||||||
|
|
||||||
|
return fixed_content
|
||||||
|
end
|
||||||
|
|
||||||
|
# # Detect encoding
|
||||||
|
# detection = CharlockHolmes::EncodingDetector.detect(raw_content)
|
||||||
|
# original_encoding = detection[:encoding]
|
||||||
|
# puts "Original encoding detected: #{original_encoding}"
|
||||||
|
|
||||||
# # Force the encoding to the detected one, then convert to UTF-8
|
# # Force the encoding to the detected one, then convert to UTF-8
|
||||||
# if original_encoding == 'ISO-8859-1' || original_encoding == 'windows-1252'
|
# if original_encoding == 'ISO-8859-1' || original_encoding == 'windows-1252'
|
||||||
# # For Windows-1252 or ISO-8859-1, force the encoding and convert to UTF-8
|
# # For Windows-1252 or ISO-8859-1, force the encoding and convert to UTF-8
|
||||||
@ -46,20 +71,20 @@ class GossamerForumsCorrectEncoding < ImportScripts::Base
|
|||||||
# text.encode('UTF-8', original_encoding, invalid: :replace, undef: :replace, replace: '?')
|
# text.encode('UTF-8', original_encoding, invalid: :replace, undef: :replace, replace: '?')
|
||||||
# end
|
# end
|
||||||
|
|
||||||
if original_encoding
|
# if original_encoding
|
||||||
begin
|
# begin
|
||||||
decoded_content = CharlockHolmes::Converter.convert(raw_content, original_encoding, 'UTF-8')
|
# decoded_content = CharlockHolmes::Converter.convert(raw_content, original_encoding, 'UTF-8')
|
||||||
rescue => e
|
# rescue => e
|
||||||
puts "Error during encoding conversion: #{e.message}"
|
# puts "Error during encoding conversion: #{e.message}"
|
||||||
decoded_content = raw_content # Fall back to raw content if decoding fails
|
# decoded_content = raw_content # Fall back to raw content if decoding fails
|
||||||
end
|
# end
|
||||||
else
|
# else
|
||||||
decoded_content = raw_content # Fallback if encoding detection fails
|
# decoded_content = raw_content # Fallback if encoding detection fails
|
||||||
end
|
# end
|
||||||
|
#
|
||||||
# Step 3: Ensure the content is now correctly in UTF-8 (no need to encode again)
|
# # Step 3: Ensure the content is now correctly in UTF-8 (no need to encode again)
|
||||||
return decoded_content
|
# return decoded_content
|
||||||
end
|
# end
|
||||||
|
|
||||||
# rescue StandardError => e
|
# rescue StandardError => e
|
||||||
# puts "Error during encoding conversion: #{e.message}"
|
# puts "Error during encoding conversion: #{e.message}"
|
||||||
|
Loading…
Reference in New Issue
Block a user