From 80a4a351b1cf30e2edc1e87e8e003d2c0de4ba12 Mon Sep 17 00:00:00 2001
From: 24c02 <163450896+24c02@users.noreply.github.com>
Date: Tue, 3 Feb 2026 17:14:19 -0500
Subject: [PATCH] write the up
---
app/views/docs/pages/what-happened.md | 65 +++++++++++
lib/tasks/import_slack_files.rake | 141 ++++++++++++++++++++++++++
2 files changed, 206 insertions(+)
create mode 100644 app/views/docs/pages/what-happened.md
create mode 100644 lib/tasks/import_slack_files.rake
diff --git a/app/views/docs/pages/what-happened.md b/app/views/docs/pages/what-happened.md
new file mode 100644
index 0000000..5750727
--- /dev/null
+++ b/app/views/docs/pages/what-happened.md
@@ -0,0 +1,65 @@
+---
+title: What happened?
+icon: question
+order: 99
+---
+
+# what happened?
+
+## the first Hack Club CDN
+
+in ~april 2020, Max & Lachlan built a CDN. a silly little thing...
+a more civilized weapon for an organization Hack Club is no longer shaped like at all...,,
+
+it worked by creating a new [Vercel](https://vercel.com) deploy every time someone wanted to add a file.
+while i'm sure vercel loved this (~~their ToS says "don't do this"~~), at some point (maybe december of 2025ish?) all the `cloud-*-hack-club-bot.vercel.app` file URLs went down.
+deployment retention policies being what they are, the deployments are not retained.
+AIUI this is because we didn't pay the bill.
+
+Hack Club CDN V1/V2 deletum est.
+
+## the second Hack Club CDN
+
+recognizing that CDNing the prior way was kinda silly, in ~february of 2025 Tom (@Deployor) wrote a new CDN!
+this was backed by a Hetzner object storage bucket, which some might say is a better design decision...
+
+eventually the card tied to the Hetzner account got receipt-locked & all the resources and data in it got nuked.
+AIUI this is because we didn't pay the bill.
+
+Hack Club CDN V3 deletum est.
+
+## but why is it _gone_?
+
+combination of two confounding factors:
+
+- no backups
+- two is one, one is none, we had none :-(
+- and, we gave out direct bucket URLs
+- this was our achilles heel, i think.
+if it's not on a domain you own, you're at the mercy of your storage provider falling out from under you.
+
+
+## i had files there!
+
+i think we failed the community here, and i'm sorry.
+i've recovered as many files as i can by scraping the originals from slack, and those are available at
+`https://cdn.hackclub.com/rescue?url=`. this is a stable URL and should work forever.
+
+here are stats on the recovery, keeping in mind that these are only the files we know about:
+
+| Source | Recovered | Unfortunately lost to time |
+|-----------------------|--------------|----------------------------|
+| Vercel via Slack | 12,126 files | 1,341 files |
+| Hetzner via Slack | 11,616 files | 725 files |
+| Vc/hel1 via Scrapbook | 21,773 files | 1,067 files |
+
+(h/t @msw for the [original pass](https://github.com/maxwofford/cdn-bucketer) at the scraper script!)
+## why should i trust that this one will last?
+very fair question given we've lost 2 CDNs and counting so far...
+this time is different because it's on a domain Hack Club owns - even if Cloudflare R2 disappears one day, we can restore a backup and redirect the `https://cdn.hackclub.com/` URLs somewhere else without you changing everywhere they're linked from. and, at least as long as i'm here......we're gonna pay the bill this time.
+
+CDN V4 is not fantastic code.
+it's written to be thrown out and replaced with something better in a few years.
+*BUT!* it is architected in such a way that when we eventually do that, **nobody will have to change their URLs**.
+
+~your pal nora <3
diff --git a/lib/tasks/import_slack_files.rake b/lib/tasks/import_slack_files.rake
new file mode 100644
index 0000000..03f00a1
--- /dev/null
+++ b/lib/tasks/import_slack_files.rake
@@ -0,0 +1,141 @@
+# frozen_string_literal: true
+
+require "csv"
+require "ruby-progressbar"
+
+namespace :import do
+  desc "Import files from Slack using a CSV with slack_file_url and slack_user_id"
+  task slack_files: :environment do
+    # Set URL options for ActiveStorage in rake context
+    ActiveStorage::Current.url_options = { host: ENV.fetch("CDN_HOST", "cdn.hackclub.com"), protocol: "https" }
+    csv_path = ENV.fetch("CSV_PATH", "files_with_slack_url.csv")
+    slack_token = ENV.fetch("SLACK_TOKEN") { raise "SLACK_TOKEN (xoxp-...) is required" }
+    # Cap workers at the AR pool size so threads never time out waiting for a DB connection
+    thread_count = [ENV.fetch("THREADS", 10).to_i, ActiveRecord::Base.connection_pool.size].min
+    dry_run = ENV["DRY_RUN"] == "true"
+
+    unless File.exist?(csv_path)
+      puts "CSV file not found: #{csv_path}"
+      exit 1
+    end
+
+    rows = CSV.read(csv_path, headers: true)
+    limit = ENV.fetch("LIMIT", rows.size).to_i
+    rows = rows.first(limit)
+    total = rows.size
+
+    puts "Found #{total} files to import#{' (DRY RUN)' if dry_run}"
+    puts "Threads: #{thread_count}"
+    puts
+
+    progressbar = ProgressBar.create(
+      total: total,
+      format: "%t |%B| %c/%C (%p%%) %e",
+      title: "Importing"
+    )
+
+    stats = {
+      success: Concurrent::AtomicFixnum.new(0),
+      skipped: Concurrent::AtomicFixnum.new(0),
+      failed: Concurrent::AtomicFixnum.new(0)
+    }
+    errors = Concurrent::Array.new
+    user_cache = Concurrent::Hash.new
+    user_cache_mutex = Mutex.new
+
+    # Pre-cache existing original_urls to avoid N+1 queries
+    puts "Loading existing uploads..."
+    existing_urls = Upload.where(original_url: rows.map { |r| r["original_url"] }.compact)
+                          .pluck(:original_url)
+                          .to_set
+    puts "Found #{existing_urls.size} already imported"
+
+    pool = Concurrent::FixedThreadPool.new(thread_count)
+
+    rows.each do |row|
+      pool.post do
+        original_url = row["original_url"]
+        slack_file_url = row["slack_file_url"]
+        slack_user_id = row["slack_user_id"]
+        filename = row["filename"]
+
+        begin
+          # Skip rows missing required Slack data
+          if slack_file_url.blank? || filename.blank?
+            stats[:skipped].increment
+            next
+          end
+
+          # Skip already-imported rows *before* the dry-run branch so DRY_RUN reports realistic counts
+          if existing_urls.include?(original_url)
+            stats[:skipped].increment
+            next
+          end
+          if dry_run
+            stats[:success].increment
+            next
+          end
+
+          # Thread-safe user lookup/creation with caching
+          user = user_cache[slack_user_id]
+          unless user
+            user_cache_mutex.synchronize do
+              user = user_cache[slack_user_id]
+              unless user
+                user = User.find_or_create_by!(slack_id: slack_user_id) do |u|
+                  u.email = nil
+                  u.name = "Slack User #{slack_user_id}"
+                end
+                user_cache[slack_user_id] = user
+              end
+            end
+          end
+
+          # Download from Slack with bearer token (bypasses quota - direct model call)
+          Upload.create_from_url(
+            slack_file_url,
+            user: user,
+            provenance: :rescued,
+            original_url: original_url,
+            authorization: "Bearer #{slack_token}",
+            filename: filename
+          )
+
+          stats[:success].increment
+        rescue => e
+          stats[:failed].increment
+          errors << { id: row["id"], original_url: original_url, error: e.message }
+        ensure
+          progressbar.increment
+        end
+      end
+    end
+
+    pool.shutdown
+    pool.wait_for_termination
+
+    progressbar.finish
+
+    puts
+    puts "Import complete:"
+    puts "  ✓ Success: #{stats[:success].value}"
+    puts "  ○ Skipped (already exists/missing data): #{stats[:skipped].value}"
+    puts "  ✗ Failed: #{stats[:failed].value}"
+
+    if errors.any?
+      puts
+      puts "Errors (first 20):"
+      errors.first(20).each do |err|
+        puts "  ID #{err[:id]}: #{err[:error]}"
+      end
+
+      # Write full error log
+      error_log_path = "import_errors_#{Time.now.strftime('%Y%m%d_%H%M%S')}.csv"
+      CSV.open(error_log_path, "w") do |csv|
+        csv << %w[id original_url error]
+        errors.each { |err| csv << [err[:id], err[:original_url], err[:error]] }
+      end
+      puts "Full error log written to: #{error_log_path}"
+    end
+  end
+end