v0.42.1 Rewrite and simplification of concurrent-ruby support

David Sainty 2024-08-17 22:23:25 +10:00
parent b44e9baf57
commit f90d0fb6ae

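In short, the commit drops the index-based batching (current_post_batch / current_post_batch_max driving a while loop) in favour of destructively shifting batches off parent_post_ids, and moves the "already processed" check inside each worker job. A minimal sketch of the resulting control flow, condensed from the hunks below; parent_post_ids, batch_size, topic_import_job and calculate_dynamic_pool_size are the script's own names, while the per-batch pool construction shown here is an assumption, since its new location falls outside the displayed hunks:

require 'concurrent'

is_complete = false
until is_complete
  # Destructively take the next batch of parent post ids.
  batch = parent_post_ids.shift(batch_size)
  break if batch.empty?

  pool = Concurrent::FixedThreadPool.new(pool_size)  # assumption: still sized via calculate_dynamic_pool_size
  batch.each do |post_id|
    pool.post do
      # Each job opens its own MariaDB client, re-checks post_status(post_id),
      # and runs topic_import_job only for unprocessed posts (see hunks below).
    end
  end

  pool.shutdown              # stop accepting jobs for this batch
  pool.wait_for_termination  # block until every job in the batch has finished
  is_complete = parent_post_ids.empty?
end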

@@ -1,7 +1,7 @@
 # Federated Computer, Inc.
 # David Sainty <saint@federated.computer> 2024 A.D.
 # Gossamer Threads to Discourse -- Migration-Import Script
-# v0.42 Mutex addition for SQLite (which may be very important)
+# v0.42.1 Rewrite and simplification of concurrent-ruby support
 require 'mysql2'
 require 'open-uri'
@@ -1077,7 +1077,9 @@ class GossamerForumsImporter < ImportScripts::Base
     parent_post_count = parent_post_ids.count
     batch_size = 100 # Set our batch size for number of posts to import in a single batch
-    current_post_batch = 0 # Set our current batch number. This tracks the current batch of posts being processed.
+    #### current_post_batch = 0 # Set our current batch number. This tracks the current batch of posts being processed.
     is_complete = false # Flag to indicate whether the import process is complete.
     # Mutex to control access to shared resources
@@ -1086,25 +1088,33 @@ class GossamerForumsImporter < ImportScripts::Base
     # Run until all posts have been processed.
     until is_complete
-      # Query in batches, create pool, wait for termination, do it again
-      current_post_batch_max = current_post_batch + batch_size
+      #### # Query in batches, create pool, wait for termination, do it again
+      #### current_post_batch_max = current_post_batch + batch_size
+      # Get the next batch of posts
+      current_post_batch = parent_post_ids.shift(batch_size)
+      break if current_post_batch.empty?
+      # Process each post in the current batch
+      current_post_batch.each do |post_id|
       ####### # Static pool size based on number of CPUs
       # # pool = Concurrent::FixedThreadPool.new(Concurrent.processor_count) # Create a thread pool that is bounded by processors available
       # # pool = Concurrent::FixedThreadPool.new(8) # Create a thread pool of 8 pool members
-      # Dynamically calculate the pool size based on system load to optimise performance
-      pool_size = calculate_dynamic_pool_size # Dynamically calculate what the pool size "ought" to be.
-      pool = Concurrent::FixedThreadPool.new(pool_size) # Create a thread pool with the calculated size
-      # Process each post in the current batch
-      while current_post_batch < current_post_batch_max
-        post_id = parent_post_ids[current_post_batch] # Fetch the post_id for the current post
-        # Check if the post has already been processed or is incomplete
-        post_status = post_status(post_id)
-        if post_status.nil? || post_status == 0
-          puts "Starting import for post_id #{post_id} in batch #{current_post_batch / batch_size + 1} with #{pool_size} threads"
+        #### # Dynamically calculate the pool size based on system load to optimise performance
+        #### pool_size = calculate_dynamic_pool_size # Dynamically calculate what the pool size "ought" to be.
+        #### pool = Concurrent::FixedThreadPool.new(pool_size) # Create a thread pool with the calculated size
+        #### # Process each post in the current batch
+        #### while current_post_batch < current_post_batch_max
+        #### post_id = parent_post_ids[current_post_batch] # Fetch the post_id for the current post
+        #### # Check if the post has already been processed or is incomplete
+        #### post_status = post_status(post_id)
         # Submit the import job for the current post_id to the thread pool
         pool.post do
@@ -1116,17 +1126,22 @@ class GossamerForumsImporter < ImportScripts::Base
             password: "yxnh93Ybbz2Nm8#mp28zCVv",
             database: "slowtwitch"
           )
           puts "PP 22 -- #{post_id}"
+          begin
           # Use connection pooling for PostgreSQL and synchronize access to shared resources
           ActiveRecord::Base.connection_pool.with_connection do
-            ### mutex.synchronize do
-            begin
-              puts "Processing post ID: #{post_id}"
+            post_status = post_status(post_id)
+            if post_status.nil? || post_status == 0
+              puts "Starting import for post_id #{post_id}"
               topic_import_job(post_id, mysql_client, sqlite_mutex) # Import topic and its replies
               sqlite_mutex.synchronize do
                 mark_post_as_complete(post_id) # Mark as complete in SQLite table
               end
+            else
+              puts "Skipping post_id #{post_id}, already processed."
+            end
+          end
           rescue => e
             puts "Error processing post ID #{post_id}: #{e.message}"
             sqlite_mutex.synchronize do
@@ -1136,22 +1151,19 @@ class GossamerForumsImporter < ImportScripts::Base
             # Ensure the MariaDB connection is closed after processing
             mysql_client.close if mysql_client
           end
-          ### end
         end
       end
-      else
-        puts "Skipping post_id #{post_id}, already processed."
-      end
-      current_post_batch += 1 # Increment, moving to next post in the batch
-      break if current_post_batch >= parent_post_count
+      #### current_post_batch += 1 # Increment, moving to next post in the batch
+      #### break if current_post_batch >= parent_post_count
       end
+      # Wait for all jobs in the current batch to finish before proceeding
       pool.shutdown # Initiate thread pool shutdown after all jobs submitted
       pool.wait_for_termination # Wait for all threads to finish exec
-      # Set when all posts have been processed
-      is_complete = true if current_post_batch >= parent_post_count
+      # Check if all posts have been processed
+      #### is_complete = true if current_post_batch >= parent_post_count
+      is_complete = parent_post_ids.empty?
     end
   end
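The per-job resource handling that the new begin/rescue/ensure wrapper establishes follows a common concurrent-ruby pattern: every posted job owns its MariaDB handle, borrows an ActiveRecord connection only for the duration of the import, and serialises SQLite bookkeeping through the shared mutex. A hedged sketch of that shape, with placeholder connection parameters; post_status, topic_import_job, mark_post_as_complete, sqlite_mutex and pool come from the script, and the failure bookkeeping call is omitted because it sits outside the shown hunks:

pool.post do
  # Per-thread MariaDB handle: Mysql2 clients are not safe to share across threads.
  mysql_client = Mysql2::Client.new(host: "db.example", username: "user",
                                    password: "secret", database: "slowtwitch")
  begin
    ActiveRecord::Base.connection_pool.with_connection do
      # Discourse/ActiveRecord work runs on a connection checked out for this thread only.
      status = post_status(post_id)
      if status.nil? || status == 0
        topic_import_job(post_id, mysql_client, sqlite_mutex)
        sqlite_mutex.synchronize { mark_post_as_complete(post_id) }
      end
    end
  rescue => e
    puts "Error processing post ID #{post_id}: #{e.message}"
    # Failure is recorded in the SQLite status table under sqlite_mutex (call not shown here).
  ensure
    mysql_client.close if mysql_client  # always release the per-thread MariaDB handle
  end
end

with_connection returns the ActiveRecord connection to its pool when the block exits, which matters here because each batch can spawn more jobs than the Rails connection pool would otherwise tolerate if connections were held for the whole batch.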