From 0ad1482f95e6fc022e7bb2465b7e4c710624af3e Mon Sep 17 00:00:00 2001 From: Ross Trottier Date: Thu, 15 Aug 2024 09:24:24 -0600 Subject: [PATCH 1/4] optimized main query for post import --- gossamer_forums.rb | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/gossamer_forums.rb b/gossamer_forums.rb index f281b23..3c07108 100644 --- a/gossamer_forums.rb +++ b/gossamer_forums.rb @@ -946,7 +946,7 @@ class GossamerForumsImporter < ImportScripts::Base # Attachment example: highest_processed_post_id = 1359862 # Execute the query to get all posts ordered by post_id - execute_query("SELECT * FROM gforum_Post ORDER BY post_id").each do |row| + execute_query("SELECT post_id, user_id_fk, forum_id_fk, post_root_id, post_subject, post_time, post_message, post_likes, post_father_id, post_replies FROM gforum_Post ORDER BY post_id").each do |row| post_id = row['post_id'].to_i # Skip posts that have already been processed From b73be6d27f42becf1be4dcf86c99cf126cc5bb96 Mon Sep 17 00:00:00 2001 From: Ross Trottier Date: Thu, 15 Aug 2024 11:49:52 -0600 Subject: [PATCH 2/4] concurrency outline --- gossamer_forums.rb | 48 +++++++++++++++++++++++++++++++++++++++++++++- 1 file changed, 47 insertions(+), 1 deletion(-) diff --git a/gossamer_forums.rb b/gossamer_forums.rb index 3c07108..e9ec085 100644 --- a/gossamer_forums.rb +++ b/gossamer_forums.rb @@ -14,6 +14,8 @@ require 'fileutils' require 'csv' require 'time' +require 'concurrent' + require File.expand_path("../../../../config/environment", __FILE__) # require_relative '../base' require File.expand_path("../../../../script/import_scripts/base", __FILE__) @@ -925,6 +927,50 @@ class GossamerForumsImporter < ImportScripts::Base result ? 
result['post_views'] : 0 end + #THREADING OUTLINE HERE -------------------------------------------- + + def threaded_topic_import + # Get list of IDS that have no parent ID - SELECT post_id FROM gforum_Post WHERE post_root_id = 0; + parent_post_ids = execute_query("SELECT post_id FROM gforum_Post WHERE post_root_id = 0") + + parent_post_count = parent_post_ids.count + batch_size = 100 #set our batch size + current_post_batch = 0 #set our current batch number + is_complete = false + + until is_complete + # Query in batches, create pool, wait for termination, do it again + # SELECT post_id FROM gforum_Post WHERE post_root_id = 0 ORDER BY post_id + current_post_batch_max = current_post_batch + batch_size + pool = Concurrent::FixedThreadPool.new(Concurrent.processor_count) #create thread pool that is bounded by processors avaialable + + while current_post_batch < current_post_batch_max + post_id = parent_post_ids[current_post_batch] + pool.post do + topic_import_job(post_id) + end + current_post_batch += 1 + end + + pool.shutdown + pool.wait_for_termination + + if current_post_batch >= parent_post_count + is_complete = true + end + end + end + + def topic_import_job(post_id) + #Here is where you can import the entire topic + #Get post -- SELECT post_id, user_id_fk, forum_id_fk, post_root_id, post_subject, post_time, post_message, post_father_id, post_replies FROM gforum_Post WHERE post_id = post_id + #check if exists, create if not + #get children, create -- SELECT post_id, user_id_fk, forum_id_fk, post_root_id, post_subject, post_time, post_message, post_father_id, post_replies FROM gforum_Post WHERE post_root_id = post_id + #this part needs to run synchronously to avoid race conditions + end + + #------------------------------------------------------------------- + # Import topics and posts from Gossamer Forums to Discourse def import_topics_and_posts_with_attachments puts "Importing topics and posts with attachments..." 
@@ -946,7 +992,7 @@ class GossamerForumsImporter < ImportScripts::Base # Attachment example: highest_processed_post_id = 1359862 # Execute the query to get all posts ordered by post_id - execute_query("SELECT post_id, user_id_fk, forum_id_fk, post_root_id, post_subject, post_time, post_message, post_likes, post_father_id, post_replies FROM gforum_Post ORDER BY post_id").each do |row| + execute_query("SELECT post_id, user_id_fk, forum_id_fk, post_root_id, post_subject, post_time, post_message, post_father_id, post_replies FROM gforum_Post ORDER BY post_id").each do |row| post_id = row['post_id'].to_i # Skip posts that have already been processed From 16c261bedef29c242d2f238bf6cfe8274f93f3ce Mon Sep 17 00:00:00 2001 From: Ross Trottier Date: Thu, 15 Aug 2024 12:51:07 -0600 Subject: [PATCH 3/4] index out of range issue fixed --- gossamer_forums.rb | 1 + 1 file changed, 1 insertion(+) diff --git a/gossamer_forums.rb b/gossamer_forums.rb index e9ec085..04f5a7f 100644 --- a/gossamer_forums.rb +++ b/gossamer_forums.rb @@ -950,6 +950,7 @@ class GossamerForumsImporter < ImportScripts::Base topic_import_job(post_id) end current_post_batch += 1 + break if current_post_batch >= parent_post_count end pool.shutdown From f3b1f0416ddd63751f1834648784c3f90c249ef8 Mon Sep 17 00:00:00 2001 From: Ross Trottier Date: Thu, 15 Aug 2024 12:58:03 -0600 Subject: [PATCH 4/4] added suggestion for thread count tweaking --- gossamer_forums.rb | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/gossamer_forums.rb b/gossamer_forums.rb index 04f5a7f..5327f2f 100644 --- a/gossamer_forums.rb +++ b/gossamer_forums.rb @@ -940,9 +940,9 @@ class GossamerForumsImporter < ImportScripts::Base until is_complete # Query in batches, create pool, wait for termination, do it again - # SELECT post_id FROM gforum_Post WHERE post_root_id = 0 ORDER BY post_id + # SELECT post_id FROM gforum_Post WHERE post_root_id = 0 ORDER BY post_id current_post_batch_max = current_post_batch + batch_size - pool 
= Concurrent::FixedThreadPool.new(Concurrent.processor_count) #create thread pool that is bounded by processors avaialable + pool = Concurrent::FixedThreadPool.new(Concurrent.processor_count) #create thread pool that is bounded by processors available, however play with the number to see what works best while current_post_batch < current_post_batch_max post_id = parent_post_ids[current_post_batch]