From 80a4a351b1cf30e2edc1e87e8e003d2c0de4ba12 Mon Sep 17 00:00:00 2001
From: 24c02 <163450896+24c02@users.noreply.github.com>
Date: Tue, 3 Feb 2026 17:14:19 -0500
Subject: [PATCH] write the up
---
app/views/docs/pages/what-happened.md | 65 +++++++++++
lib/tasks/import_slack_files.rake | 141 ++++++++++++++++++++++++++
2 files changed, 206 insertions(+)
create mode 100644 app/views/docs/pages/what-happened.md
create mode 100644 lib/tasks/import_slack_files.rake
diff --git a/app/views/docs/pages/what-happened.md b/app/views/docs/pages/what-happened.md
new file mode 100644
index 0000000..5750727
--- /dev/null
+++ b/app/views/docs/pages/what-happened.md
@@ -0,0 +1,65 @@
+---
+title: What happened?
+icon: question
+order: 99
+---
+
+# what happened?
+
+## the first Hack Club CDN
+
+in ~april 2020, Max & Lachlan built a CDN. a silly little thing...
+a more civilized weapon for an organization Hack Club is no longer shaped like at all...,,
+
+it worked by creating a new [Vercel](https://vercel.com) deploy every time someone wanted to add a file.
+while i'm sure vercel loved this (~~their ToS says "don't do this"~~), at some point (maybe december of 2025ish?) all the `cloud-*-hack-club-bot.vercel.app` file URLs went down.
+deployment retention policies being what they are, the deployments are not retained.
+AIUI this is because we didn't pay the bill.
+
+Hack Club CDN V1/V2 deletum est.
+
+## the second Hack Club CDN
+
+recognizing that CDNing the prior way was kinda silly, in ~february of 2025 Tom (@Deployor) wrote a new CDN!
+this was backed by a Hetzner object storage bucket, which some might say is a better design decision...
+
+eventually the card tied to the Hetzner account got receipt-locked & all the resources and data in it got nuked.
+AIUI this is because we didn't pay the bill.
+
+Hack Club CDN V3 deletum est.
+
+## but why is it _gone_?
+
+combination of two confounding factors:
+
+- no backups
+- two is one, one is none, we had none :-(
+- and, we gave out direct bucket URLs
+- this was our achilles heel, i think.
+if it's not on a domain you own, you're at the mercy of your storage provider falling out from under you.
+
+
+## i had files there!
+
+i think we failed the community here, and i'm sorry.
+i've recovered as many files as i can by scraping the originals from slack, and those are available at
+`https://cdn.hackclub.com/rescue?url=`. this is a stable URL and should work forever.
+
+here are stats on the recovery, keeping in mind that these are only the files we know about:
+
+| Source | Recovered | Unfortunately lost to time |
+|-----------------------|--------------|----------------------------|
+| Vercel via Slack | 12,126 files | 1,341 files |
+| Hetzner via Slack | 11,616 files | 725 files |
+| Vc/hel1 via Scrapbook | 21,773 files | 1,067 files |
+
+(h/t @msw for the [original pass](https://github.com/maxwofford/cdn-bucketer) at the scraper script!)
+## why should i trust that this one will last?
+very fair question given we've lost 2 CDNs and counting so far...
+this time is different because it's on a domain Hack Club owns - even if Cloudflare R2 disappears one day, we can restore a backup and redirect the `https://cdn.hackclub.com/` URLs somewhere else without you changing everywhere they're linked from. and, at least as long as i'm here......we're gonna pay the bill this time.
+
+CDN V4 is not fantastic code.
+it's written to be thrown out and replaced with something better in a few years.
+*BUT!* it is architected in such a way that when we eventually do that, **nobody will have to change their URLs**.
+
+~your pal nora <3
diff --git a/lib/tasks/import_slack_files.rake b/lib/tasks/import_slack_files.rake
new file mode 100644
index 0000000..03f00a1
--- /dev/null
+++ b/lib/tasks/import_slack_files.rake
@@ -0,0 +1,141 @@
+# frozen_string_literal: true
+
+require "csv"
+require "ruby-progressbar"
+
+namespace :import do
+  desc "Import files from Slack using a CSV with slack_file_url and slack_user_id"
+  task slack_files: :environment do
+    # Set URL options for ActiveStorage in rake context
+    ActiveStorage::Current.url_options = { host: ENV.fetch("CDN_HOST", "cdn.hackclub.com"), protocol: "https" }
+    csv_path = ENV.fetch("CSV_PATH", "files_with_slack_url.csv")
+    slack_token = ENV.fetch("SLACK_TOKEN") { raise "SLACK_TOKEN (xoxp-...) is required" }
+    # Cap workers at the AR pool size so threads never time out waiting for a DB connection
+    thread_count = [ENV.fetch("THREADS", 10).to_i, ActiveRecord::Base.connection_pool.size].min
+    dry_run = ENV["DRY_RUN"] == "true"
+
+    unless File.exist?(csv_path)
+      puts "CSV file not found: #{csv_path}"
+      exit 1
+    end
+
+    rows = CSV.read(csv_path, headers: true)
+    limit = ENV.fetch("LIMIT", rows.size).to_i
+    rows = rows.first(limit)
+    total = rows.size
+
+    puts "Found #{total} files to import#{' (DRY RUN)' if dry_run}"
+    puts "Threads: #{thread_count}"
+    puts
+
+    progressbar = ProgressBar.create(
+      total: total,
+      format: "%t |%B| %c/%C (%p%%) %e",
+      title: "Importing"
+    )
+
+    stats = {
+      success: Concurrent::AtomicFixnum.new(0),
+      skipped: Concurrent::AtomicFixnum.new(0),
+      failed: Concurrent::AtomicFixnum.new(0)
+    }
+    errors = Concurrent::Array.new
+    user_cache = Concurrent::Hash.new
+    user_cache_mutex = Mutex.new
+
+    # Pre-cache existing original_urls to avoid N+1 queries
+    puts "Loading existing uploads..."
+    existing_urls = Upload.where(original_url: rows.map { |r| r["original_url"] }.compact)
+                          .pluck(:original_url)
+                          .to_set
+    puts "Found #{existing_urls.size} already imported"
+
+    pool = Concurrent::FixedThreadPool.new(thread_count)
+
+    rows.each do |row|
+      pool.post do
+        original_url = row["original_url"]
+        slack_file_url = row["slack_file_url"]
+        slack_user_id = row["slack_user_id"]
+        filename = row["filename"]
+
+        begin
+          # Skip rows missing required Slack data
+          if slack_file_url.blank? || filename.blank?
+            stats[:skipped].increment
+            next
+          end
+
+          # Skip already-imported rows *before* the dry-run branch so DRY_RUN reports realistic counts
+          if existing_urls.include?(original_url)
+            stats[:skipped].increment
+            next
+          end
+          if dry_run
+            stats[:success].increment
+            next
+          end
+
+          # Thread-safe user lookup/creation with caching
+          user = user_cache[slack_user_id]
+          unless user
+            user_cache_mutex.synchronize do
+              user = user_cache[slack_user_id]
+              unless user
+                user = User.find_or_create_by!(slack_id: slack_user_id) do |u|
+                  u.email = nil
+                  u.name = "Slack User #{slack_user_id}"
+                end
+                user_cache[slack_user_id] = user
+              end
+            end
+          end
+
+          # Download from Slack with bearer token (bypasses quota - direct model call)
+          Upload.create_from_url(
+            slack_file_url,
+            user: user,
+            provenance: :rescued,
+            original_url: original_url,
+            authorization: "Bearer #{slack_token}",
+            filename: filename
+          )
+
+          stats[:success].increment
+        rescue => e
+          stats[:failed].increment
+          errors << { id: row["id"], original_url: original_url, error: e.message }
+        ensure
+          progressbar.increment
+        end
+      end
+    end
+
+    pool.shutdown
+    pool.wait_for_termination
+
+    progressbar.finish
+
+    puts
+    puts "Import complete:"
+    puts "  ✓ Success: #{stats[:success].value}"
+    puts "  ○ Skipped (already exists/missing data): #{stats[:skipped].value}"
+    puts "  ✗ Failed: #{stats[:failed].value}"
+
+    if errors.any?
+      puts
+      puts "Errors (first 20):"
+      errors.first(20).each do |err|
+        puts "  ID #{err[:id]}: #{err[:error]}"
+      end
+
+      # Write full error log
+      error_log_path = "import_errors_#{Time.now.strftime('%Y%m%d_%H%M%S')}.csv"
+      CSV.open(error_log_path, "w") do |csv|
+        csv << %w[id original_url error]
+        errors.each { |err| csv << [err[:id], err[:original_url], err[:error]] }
+      end
+      puts "Full error log written to: #{error_log_path}"
+    end
+  end
+end