From f685c631c93d59d5e867e2438cd4a77dd84f073d Mon Sep 17 00:00:00 2001 From: Richard Tibbles Date: Wed, 24 Jun 2026 22:51:28 -0700 Subject: [PATCH 1/2] feat: add GCS resumable upload storage helpers - supports_resumable flag on the GCS storage backends - get_stored_object_md5_b64: dedup lookup against an object's GCS-computed md5 - create_resumable_upload_session: pins md5 + declared-size metadata - hex_to_base64 checksum helper Part of #5975. Co-Authored-By: Claude Opus 4.8 (1M context) Claude-Session: https://claude.ai/code/session_01UzUP3UYP4cLyouvssXyekj --- .../tests/test_storage_common.py | 17 +++++++++++++ .../contentcuration/utils/gcs_storage.py | 4 ++++ .../contentcuration/utils/storage_common.py | 24 +++++++++++++++++++ 3 files changed, 45 insertions(+) diff --git a/contentcuration/contentcuration/tests/test_storage_common.py b/contentcuration/contentcuration/tests/test_storage_common.py index f89534c194..da50454803 100644 --- a/contentcuration/contentcuration/tests/test_storage_common.py +++ b/contentcuration/contentcuration/tests/test_storage_common.py @@ -12,7 +12,9 @@ from .base import StudioTestCase from contentcuration.models import generate_object_storage_name +from contentcuration.utils.gcs_storage import GoogleCloudStorage from contentcuration.utils.storage_common import _get_gcs_presigned_put_url +from contentcuration.utils.storage_common import create_resumable_upload_session from contentcuration.utils.storage_common import determine_content_type from contentcuration.utils.storage_common import get_presigned_upload_url from contentcuration.utils.storage_common import UnknownStorageBackendError @@ -157,6 +159,21 @@ def test_generate_signed_url_called_with_required_arguments(self): content_type=mimetype, ) + def test_create_resumable_session_pins_md5_size_and_returns_url(self): + storage = GoogleCloudStorage(self.client, "bucket") + blob = self.client.get_bucket.return_value.blob.return_value + blob.create_resumable_upload_session.return_value = 
"https://session.url" + + url = create_resumable_upload_session( + "storage/a/b/abc.jpg", "md5==\n", 2048, storage=storage + ) + + assert url == "https://session.url" + assert blob.md5_hash == "md5==" # whitespace stripped + assert blob.content_type == "image/jpeg" + assert blob.metadata == {"declared-size": "2048"} + blob.create_resumable_upload_session.assert_called_once() + class S3StoragePresignedURLUnitTestCase(StudioTestCase): """ diff --git a/contentcuration/contentcuration/utils/gcs_storage.py b/contentcuration/contentcuration/utils/gcs_storage.py index 5c4a425aec..048a91cc45 100644 --- a/contentcuration/contentcuration/utils/gcs_storage.py +++ b/contentcuration/contentcuration/utils/gcs_storage.py @@ -27,6 +27,8 @@ def _create_default_client( class GoogleCloudStorage(Storage): + supports_resumable = True + def __init__(self, client, bucket_name): self.client = client self.bucket = self.client.get_bucket(bucket_name) @@ -217,6 +219,8 @@ def _is_file_empty(fobj): class CompositeGCS(Storage): + supports_resumable = True + def __init__(self): self.backends = [] self.backends.append( diff --git a/contentcuration/contentcuration/utils/storage_common.py b/contentcuration/contentcuration/utils/storage_common.py index 10d79bd5c5..1be872391e 100644 --- a/contentcuration/contentcuration/utils/storage_common.py +++ b/contentcuration/contentcuration/utils/storage_common.py @@ -1,3 +1,4 @@ +import codecs import mimetypes import os from datetime import timedelta @@ -35,6 +36,29 @@ def determine_content_type(filename): return typ +def hex_to_base64(hexdigest): + """Convert a hex-encoded digest (e.g. 
an MD5 checksum) to base64.""" + return codecs.encode(codecs.decode(hexdigest, "hex"), "base64").decode() + + +def create_resumable_upload_session(filepath, md5_b64, size, storage=default_storage): + client = storage.get_client() + blob = client.get_bucket(settings.AWS_S3_BUCKET_NAME).blob(filepath) + blob.md5_hash = md5_b64.strip() + blob.content_type = determine_content_type(filepath) + blob.metadata = {"declared-size": str(size)} + return blob.create_resumable_upload_session(client=client) + + +def get_stored_object_md5_b64(filepath, storage=default_storage): + if not getattr(storage, "supports_resumable", False): + return None + blob = ( + storage.get_client().get_bucket(settings.AWS_S3_BUCKET_NAME).get_blob(filepath) + ) + return blob.md5_hash if blob is not None else None + + def get_presigned_upload_url( filepath, md5sum_b64, From 47aebd89726d42f9cd0f76924dae2cf85fe7cbc9 Mon Sep 17 00:00:00 2001 From: Richard Tibbles Date: Wed, 24 Jun 2026 22:51:36 -0700 Subject: [PATCH 2/2] feat: add opt-in resumable scheme to the upload_url endpoint - accept a `resumable` flag (defaults off) - GCS: skip when the stored md5 matches the checksum, else return a server-initiated resumable session URI - non-GCS backends fall back to single-PUT - reject non-resumable uploads over 500 MB - make `size` an IntegerField, dropping the redundant float casts Part of #5975. 
Co-Authored-By: Claude Opus 4.8 (1M context) Claude-Session: https://claude.ai/code/session_01UzUP3UYP4cLyouvssXyekj --- .../tests/viewsets/test_file.py | 99 +++++++++++++++++++ .../contentcuration/viewsets/file.py | 50 +++++++--- 2 files changed, 133 insertions(+), 16 deletions(-) diff --git a/contentcuration/contentcuration/tests/viewsets/test_file.py b/contentcuration/contentcuration/tests/viewsets/test_file.py index 9737c7f4bd..77910fd4a9 100644 --- a/contentcuration/contentcuration/tests/viewsets/test_file.py +++ b/contentcuration/contentcuration/tests/viewsets/test_file.py @@ -1,4 +1,6 @@ +import codecs import uuid +from unittest import mock from django.urls import reverse from le_utils.constants import content_kinds @@ -12,6 +14,8 @@ from contentcuration.tests.viewsets.base import generate_delete_event from contentcuration.tests.viewsets.base import generate_update_event from contentcuration.tests.viewsets.base import SyncTestMixin +from contentcuration.viewsets.file import FileUploadURLSerializer +from contentcuration.viewsets.file import MAX_NON_RESUMABLE_UPLOAD_SIZE from contentcuration.viewsets.sync.constants import CONTENTNODE from contentcuration.viewsets.sync.constants import FILE @@ -546,6 +550,9 @@ def test_mismatched_preset_upload(self): def test_insufficient_storage(self): self.file["size"] = 100000000000000 + self.file[ + "resumable" + ] = True # resumable bypasses the >500MB guard so this still exercises the quota (412) path self.client.force_authenticate(user=self.user) response = self.client.post( @@ -590,6 +597,26 @@ def test_duration_zero(self): self.assertEqual(response.status_code, 400) + def test_fractional_size_rejected(self): + s = FileUploadURLSerializer(data={**self.file, "size": 1000.5}) + assert not s.is_valid() + assert "size" in s.errors + + def test_large_non_resumable_rejected(self): + self.file["size"] = MAX_NON_RESUMABLE_UPLOAD_SIZE + 1 + self.client.force_authenticate(user=self.user) + resp = 
self.client.post(reverse("file-upload-url"), self.file, format="json") + assert resp.status_code == 400 + + def test_large_resumable_allowed(self): + self.user.disk_space = 10 * 1024 * 1024 * 1024 + self.user.save() + self.file["size"] = MAX_NON_RESUMABLE_UPLOAD_SIZE + 1 + self.file["resumable"] = True + self.client.force_authenticate(user=self.user) + resp = self.client.post(reverse("file-upload-url"), self.file, format="json") + assert resp.status_code == 200 + class ContentIDTestCase(SyncTestMixin, StudioAPITestCase): def setUp(self): @@ -763,3 +790,75 @@ def test_content_id__thumbnails_dont_update_content_id(self): self.assertEqual( copied_node_content_id_before_upload, copied_node_content_id_after_upload ) + + +class ResumableUploadURLTestCase(StudioAPITestCase): + def setUp(self): + super(ResumableUploadURLTestCase, self).setUp() + self.user = testdata.user() + # Give user enough quota to handle resumable uploads + self.user.disk_space = 10 * 1024 * 1024 * 1024 + self.user.save() + self.file = { + "size": 1000, + "checksum": uuid.uuid4().hex, + "name": "le_studio", + "file_format": file_formats.MP3, + "preset": format_presets.AUDIO, + "duration": 10.123, + "resumable": True, + } + + def _checksum_b64(self): + return ( + codecs.encode(codecs.decode(self.file["checksum"], "hex"), "base64") + .decode() + .strip() + ) + + @mock.patch("contentcuration.viewsets.file.default_storage") + @mock.patch("contentcuration.viewsets.file.create_resumable_upload_session") + @mock.patch("contentcuration.viewsets.file.get_stored_object_md5_b64") + def test_resumable_returns_session_when_not_stored( + self, mock_md5, mock_session, mock_storage + ): + mock_storage.supports_resumable = True + mock_md5.return_value = None + mock_session.return_value = "https://session.url" + self.client.force_authenticate(user=self.user) + resp = self.client.post(reverse("file-upload-url"), self.file, format="json") + assert resp.status_code == 200 + data = resp.json() + assert data["resumable"] is 
True + assert data["uploadURL"] == "https://session.url" + assert data["alreadyUploaded"] is False + assert "file" in data + assert data["file"]["id"] + mock_session.assert_called_once() + + @mock.patch("contentcuration.viewsets.file.default_storage") + @mock.patch("contentcuration.viewsets.file.create_resumable_upload_session") + @mock.patch("contentcuration.viewsets.file.get_stored_object_md5_b64") + def test_resumable_skips_when_already_stored( + self, mock_md5, mock_session, mock_storage + ): + mock_storage.supports_resumable = True + mock_md5.return_value = self._checksum_b64() + self.client.force_authenticate(user=self.user) + resp = self.client.post(reverse("file-upload-url"), self.file, format="json") + data = resp.json() + assert data["resumable"] is True and data["alreadyUploaded"] is True + assert data["uploadURL"] is None + assert "file" in data + assert data["file"]["id"] + mock_session.assert_not_called() + + def test_resumable_falls_back_to_single_put_on_s3(self): + # default_storage is S3 in the test env → no resumable support + self.client.force_authenticate(user=self.user) + resp = self.client.post(reverse("file-upload-url"), self.file, format="json") + data = resp.json() + assert data["resumable"] is False + assert "uploadURL" in data + assert "file" in data + assert data["file"]["id"] diff --git a/contentcuration/contentcuration/viewsets/file.py b/contentcuration/contentcuration/viewsets/file.py index afadbff0cb..9f9422a242 100644 --- a/contentcuration/contentcuration/viewsets/file.py +++ b/contentcuration/contentcuration/viewsets/file.py @@ -1,7 +1,7 @@ -import codecs import math from django.core.exceptions import PermissionDenied +from django.core.files.storage import default_storage from django.http import HttpResponseBadRequest from le_utils.constants import file_formats from le_utils.constants import format_presets @@ -18,7 +18,10 @@ from contentcuration.models import generate_storage_url from contentcuration.utils.cache import 
ResourceSizeCache from contentcuration.utils.sentry import report_exception +from contentcuration.utils.storage_common import create_resumable_upload_session from contentcuration.utils.storage_common import get_presigned_upload_url +from contentcuration.utils.storage_common import get_stored_object_md5_b64 +from contentcuration.utils.storage_common import hex_to_base64 from contentcuration.utils.user import calculate_user_storage from contentcuration.viewsets.base import BulkDeleteMixin from contentcuration.viewsets.base import BulkListSerializer @@ -34,6 +37,8 @@ PRESET_LOOKUP = {p.id: p for p in format_presets.PRESETLIST} +MAX_NON_RESUMABLE_UPLOAD_SIZE = 500 * 1024 * 1024 + class StrictFloatField(serializers.FloatField): def to_internal_value(self, data): @@ -48,7 +53,7 @@ class FileUploadURLSerializer(serializers.Serializer): """ Serializer to validate inputs for the upload_url endpoint. Required: - - size: a float value + - size: an integer value (bytes) - checksum: a 32-digit hex string - name: a string (note: mapped from request.data['name']) - file_format: a valid file format choice from file_formats.choices @@ -57,12 +62,13 @@ class FileUploadURLSerializer(serializers.Serializer): - duration: a number that will be floored to an integer and must be > 0 """ - size = serializers.FloatField(required=True) + size = serializers.IntegerField(required=True) checksum = serializers.RegexField(regex=r"^[0-9a-f]{32}$", required=True) name = serializers.CharField(required=True) file_format = serializers.ChoiceField(choices=file_formats.choices, required=True) preset = serializers.ChoiceField(choices=format_presets.choices, required=True) duration = StrictFloatField(required=False, allow_null=True) + resumable = serializers.BooleanField(required=False, default=False) def validate_duration(self, value): if value is None: @@ -89,6 +95,10 @@ def validate(self, attrs): raise serializers.ValidationError( f"File format {attrs['file_format']} is not an allowed format for this 
preset {attrs['preset']}" ) + if not attrs["resumable"] and attrs["size"] > MAX_NON_RESUMABLE_UPLOAD_SIZE: + raise serializers.ValidationError( + "Files larger than 500 MB must use a resumable upload." + ) return attrs @@ -235,26 +245,37 @@ def upload_url(self, request): file_format = validated_data["file_format"] preset = validated_data["preset"] duration = validated_data.get("duration") + resumable = validated_data["resumable"] try: - request.user.check_space(float(size), checksum) + request.user.check_space(size, checksum) except PermissionDenied: return HttpResponseBadRequest( reason="Not enough space. Check your storage under Settings page.", status=412, ) - might_skip = File.objects.filter(checksum=checksum).exists() - filepath = generate_object_storage_name( checksum, filename, default_ext=file_format ) - checksum_base64 = codecs.encode( - codecs.decode(checksum, "hex"), "base64" - ).decode() - retval = get_presigned_upload_url( - filepath, checksum_base64, 600, content_length=size - ) + checksum_base64 = hex_to_base64(checksum) + + if resumable and getattr(default_storage, "supports_resumable", False): + # Resumable response omits mimetype/might_skip. + stored = get_stored_object_md5_b64(filepath) == checksum_base64.strip() + retval = { + "resumable": True, + "uploadURL": None + if stored + else create_resumable_upload_session(filepath, checksum_base64, size), + "alreadyUploaded": stored, + } + else: + retval = get_presigned_upload_url( + filepath, checksum_base64, 600, content_length=size + ) + retval["resumable"] = False + retval["might_skip"] = File.objects.filter(checksum=checksum).exists() file = File( file_size=size, @@ -270,8 +291,5 @@ def upload_url(self, request): # Avoid using our file_on_disk attribute for checks file.save(set_by_file_on_disk=False) - retval.update( - {"might_skip": might_skip, "file": self.serialize_object(id=file.id)} - ) - + retval["file"] = self.serialize_object(id=file.id) return Response(retval)