mirror of
https://github.com/System-End/cdn.git
synced 2026-04-19 15:18:15 +00:00
write the up
This commit is contained in:
parent
f54051d764
commit
80a4a351b1
2 changed files with 202 additions and 0 deletions
61
app/views/docs/pages/what-happened.md
Normal file
61
app/views/docs/pages/what-happened.md
Normal file
|
|
@ -0,0 +1,61 @@
|
|||
---
|
||||
title: What happened?
|
||||
icon: question
|
||||
order: 99
|
||||
---
|
||||
|
||||
# what happened?
|
||||
|
||||
## the first Hack Club CDN
|
||||
|
||||
in ~april 2020, Max & Lachlan built a CDN. a silly little thing...
|
||||
a more civilized weapon for an organization Hack Club is no longer shaped like at all...
|
||||
|
||||
it worked by creating a new [Vercel](https://vercel.com) deploy every time someone wanted to add a file.
|
||||
while i'm sure vercel loved this (~~their ToS says "don't do this"~~), at some point (maybe december of 2025ish?) all the `cloud-*-hack-club-bot.vercel.app` file URLs went down.
|
||||
deployment retention policies being what they are, the deployments are not retained.
|
||||
AIUI this is because we didn't pay the bill.
|
||||
|
||||
Hack Club CDN V1/V2 deletum est.
|
||||
|
||||
## the second Hack Club CDN
|
||||
|
||||
recognizing that CDNing the prior way was kinda silly, in ~february of 2025 Tom (@Deployor) wrote a new CDN!
|
||||
this was backed by a Hetzner object storage bucket, which some might say is a better design decision...
|
||||
|
||||
eventually the card tied to the Hetzner account got receipt-locked & all the resources and data in it got nuked.
|
||||
AIUI this is because we didn't pay the bill.
|
||||
|
||||
Hack Club CDN V3 deletum est.
|
||||
|
||||
## but why is it _gone_?
|
||||
|
||||
combination of two confounding factors:
|
||||
- no backups
  - two is one, one is none, we had none :-(
- and, we gave out direct bucket URLs
  - this was our achilles heel, i think. if it's not on a domain you own, you're at the mercy of your storage provider falling out from under you.
|
||||
|
||||
## i had files there!
|
||||
|
||||
i think we failed the community here, and i'm sorry.
|
||||
i've recovered as many files as i can by scraping the originals from slack, and those are available at
|
||||
`https://cdn.hackclub.com/rescue?url=<vercel/hel1 URL>`. this is a stable URL and should work forever.
|
||||
|
||||
here are stats on the recovery, keeping in mind that these are only the files we know about:
|
||||
|
||||
| Source | Recovered | Unfortunately lost to time |
|
||||
|-----------------------|--------------|----------------------------|
|
||||
| Vercel via Slack | 12,126 files | 1,341 files |
|
||||
| Hetzner via Slack | 11,616 files | 725 files |
|
||||
| Vc/hel1 via Scrapbook | 21,773 files | 1,067 files |
|
||||
|
||||
(h/t @msw for the [original pass](https://github.com/maxwofford/cdn-bucketer) at the scraper script!)
|
||||
## why should i trust that this one will last?
|
||||
very fair question given we've lost 2 CDNs and counting so far...
|
||||
this time is different because it's on a domain Hack Club owns - even if Cloudflare R2 disappears one day, we can restore a backup and redirect the `https://cdn.hackclub.com/<id>` URLs somewhere else without you changing everywhere they're linked from. and, at least as long as i'm here......we're gonna pay the bill this time.
|
||||
|
||||
CDN V4 is not fantastic code.
|
||||
it's written to be thrown out and replaced with something better in a few years.
|
||||
*BUT!* it is architected in such a way that when we eventually do that, **nobody will have to change their URLs**.
|
||||
|
||||
~your pal nora <3
|
||||
141
lib/tasks/import_slack_files.rake
Normal file
141
lib/tasks/import_slack_files.rake
Normal file
|
|
@ -0,0 +1,141 @@
|
|||
# frozen_string_literal: true
|
||||
|
||||
require "csv"
|
||||
require "ruby-progressbar"
|
||||
|
||||
namespace :import do
  desc "Import files from Slack using a CSV with slack_file_url and slack_user_id"
  # Re-imports rescued CDN files by downloading the originals from Slack.
  #
  # ENV:
  #   CSV_PATH    - input CSV (default: files_with_slack_url.csv); needs columns
  #                 id, original_url, slack_file_url, slack_user_id, filename
  #   SLACK_TOKEN - required Slack user token (xoxp-...), sent as a Bearer header
  #   CDN_HOST    - host for generated ActiveStorage URLs (default: cdn.hackclub.com)
  #   THREADS     - worker thread count (default: 10)
  #   LIMIT       - cap on rows processed (default: all rows)
  #   DRY_RUN     - "true" to count work without writing anything
  task slack_files: :environment do
    # ActiveStorage needs explicit url_options when running outside a request cycle.
    ActiveStorage::Current.url_options = { host: ENV.fetch("CDN_HOST", "cdn.hackclub.com"), protocol: "https" }
    csv_path = ENV.fetch("CSV_PATH", "files_with_slack_url.csv")
    slack_token = ENV.fetch("SLACK_TOKEN") { raise "SLACK_TOKEN (xoxp-...) is required" }
    thread_count = ENV.fetch("THREADS", 10).to_i
    dry_run = ENV["DRY_RUN"] == "true"

    unless File.exist?(csv_path)
      puts "CSV file not found: #{csv_path}"
      exit 1
    end

    rows = CSV.read(csv_path, headers: true)
    limit = ENV.fetch("LIMIT", rows.size).to_i
    rows = rows.first(limit)
    total = rows.size

    puts "Found #{total} files to import#{' (DRY RUN)' if dry_run}"
    puts "Threads: #{thread_count}"
    puts

    progressbar = ProgressBar.create(
      total: total,
      format: "%t |%B| %c/%C (%p%%) %e",
      title: "Importing"
    )

    stats = {
      success: Concurrent::AtomicFixnum.new(0),
      skipped: Concurrent::AtomicFixnum.new(0),
      failed: Concurrent::AtomicFixnum.new(0)
    }
    errors = Concurrent::Array.new
    # Concurrent::Map#compute_if_absent gives atomic per-key find-or-create,
    # replacing the previous hand-rolled Mutex + double-checked-locking dance.
    user_cache = Concurrent::Map.new

    # Pre-cache existing original_urls to avoid an existence query per row.
    puts "Loading existing uploads..."
    existing_urls = Upload.where(original_url: rows.map { |r| r["original_url"] }.compact)
                          .pluck(:original_url)
                          .to_set
    puts "Found #{existing_urls.size} already imported"

    pool = Concurrent::FixedThreadPool.new(thread_count)

    rows.each do |row|
      pool.post do
        original_url = row["original_url"]
        slack_file_url = row["slack_file_url"]
        slack_user_id = row["slack_user_id"]
        filename = row["filename"]

        begin
          # Skip rows missing required Slack data
          if slack_file_url.blank? || filename.blank?
            stats[:skipped].increment
            next
          end

          # Skip if already imported (using pre-cached set). Checked before the
          # dry-run branch so a dry run previews realistic skip/success counts.
          if existing_urls.include?(original_url)
            stats[:skipped].increment
            next
          end

          if dry_run
            stats[:success].increment
            next
          end

          # Pool threads are long-lived; check the AR connection back in after
          # each row so we don't pin THREADS connections for the whole run (and
          # so THREADS > the AR pool size can't raise ConnectionTimeoutError).
          ActiveRecord::Base.connection_pool.with_connection do
            user = user_cache.compute_if_absent(slack_user_id) do
              User.find_or_create_by!(slack_id: slack_user_id) do |u|
                u.email = nil
                u.name = "Slack User #{slack_user_id}"
              end
            end

            # Download from Slack with bearer token (bypasses quota - direct model call)
            Upload.create_from_url(
              slack_file_url,
              user: user,
              provenance: :rescued,
              original_url: original_url,
              authorization: "Bearer #{slack_token}",
              filename: filename
            )
          end

          stats[:success].increment
        rescue => e
          stats[:failed].increment
          errors << { id: row["id"], original_url: original_url, error: e.message }
        ensure
          # Always advance the bar, even for skipped or failed rows.
          progressbar.increment
        end
      end
    end

    pool.shutdown
    pool.wait_for_termination

    progressbar.finish

    puts
    puts "Import complete:"
    puts " ✓ Success: #{stats[:success].value}"
    puts " ○ Skipped (already exists/missing data): #{stats[:skipped].value}"
    puts " ✗ Failed: #{stats[:failed].value}"

    if errors.any?
      puts
      puts "Errors (first 20):"
      errors.first(20).each do |err|
        puts " ID #{err[:id]}: #{err[:error]}"
      end

      # Write full error log
      error_log_path = "import_errors_#{Time.now.strftime('%Y%m%d_%H%M%S')}.csv"
      CSV.open(error_log_path, "w") do |csv|
        csv << %w[id original_url error]
        errors.each { |err| csv << [err[:id], err[:original_url], err[:error]] }
      end
      puts "Full error log written to: #{error_log_path}"
    end
  end
end
|
||||
Loading…
Add table
Reference in a new issue