Skip to content

Commit

Permalink
Merge pull request #3679 from bensteinberg/user-upload-wacz
Browse files Browse the repository at this point in the history
Save user uploads as WACZs
  • Loading branch information
bensteinberg authored Jan 9, 2025
2 parents 8921495 + 010007c commit 4dfa0f0
Show file tree
Hide file tree
Showing 8 changed files with 376 additions and 63 deletions.
11 changes: 4 additions & 7 deletions perma_web/api/tests/test_link_authorization.py
Original file line number Diff line number Diff line change
Expand Up @@ -147,14 +147,10 @@ def test_should_allow_user_to_patch_with_file(self):
# capture were properly associated with actual web archive files, which is always
# the case outside of tests
self.link.archive_timestamp = timezone.now() + timedelta(1)
self.link.warc_size = 1
self.link.wacz_size = 1
self.link.save()

# This link has a warc and a wacz
self.link.refresh_from_db()
self.assertTrue(self.link.warc_size)
self.assertTrue(self.link.wacz_size)

old_primary_capture = self.link.primary_capture

Expand All @@ -167,11 +163,12 @@ def test_should_allow_user_to_patch_with_file(self):
data={'file':file_content})

self.assertTrue(Capture.objects.filter(link_id=self.link.pk, role='primary').exclude(pk=old_primary_capture.pk).exists())
self.assertTrue(Capture.objects.filter(link_id=self.link.pk, role='provenance_summary').exists())

# This link now only has a warc, but not a wacz
self.link.refresh_from_db()
self.assertTrue(self.link.warc_size)
self.assertFalse(self.link.wacz_size)

self.assertTrue(self.link.wacz_size)
self.assertTrue(self.link.wacz_size != 1)


def test_should_reject_patch_with_file_for_out_of_window_link(self):
Expand Down
11 changes: 5 additions & 6 deletions perma_web/api/tests/test_link_resource.py
Original file line number Diff line number Diff line change
Expand Up @@ -94,7 +94,7 @@ def setUp(self):
'private_reason',
]

def assertRecordsInArchive(self, link, upload=False, expected_records=None, check_screenshot=False, check_provenance_summary=False, filetype='wacz'):
def assertRecordsInArchive(self, link, upload=False, expected_records=None, check_screenshot=False, check_provenance_summary=False):

def find_recording_in_warc(index, capture_url, content_type):
warc_content_type = "application/http; msgtype=response"
Expand Down Expand Up @@ -129,8 +129,7 @@ def find_attachment_in_warc(index, capture_url):
self.assertTrue(link.primary_capture.content_type, "Capture is missing a content type.")

# create an index of the warc
extract = filetype == 'wacz'
with link.get_warc(extract) as warc_file:
with link.get_warc() as warc_file:
index = index_warc_file(warc_file)

# see if the index reports the content is in the warc
Expand Down Expand Up @@ -655,7 +654,7 @@ def test_should_create_archive_from_pdf_file(self):
user=self.org_user)

link = Link.objects.get(guid=obj['guid'])
self.assertRecordsInArchive(link, upload=True, filetype='warc')
self.assertRecordsInArchive(link, upload=True)
self.assertEqual(link.primary_capture.user_upload, True)

def test_should_create_archive_from_jpg_file(self):
Expand All @@ -666,7 +665,7 @@ def test_should_create_archive_from_jpg_file(self):
user=self.org_user)

link = Link.objects.get(guid=obj['guid'])
self.assertRecordsInArchive(link, upload=True, filetype='warc')
self.assertRecordsInArchive(link, upload=True)
self.assertEqual(link.primary_capture.user_upload, True)

def test_should_reject_jpg_file_with_invalid_url(self):
Expand All @@ -687,7 +686,7 @@ def test_should_should_create_archive_from_jpg_file_with_nonloading_url(self):

link = Link.objects.get(guid=obj['guid'])
self.assertEqual(link.submitted_url, 'http://asdf.asdf')
self.assertRecordsInArchive(link, upload=True, filetype='warc')
self.assertRecordsInArchive(link, upload=True)
self.assertEqual(link.primary_capture.user_upload, True)

def test_should_reject_invalid_file(self):
Expand Down
2 changes: 1 addition & 1 deletion perma_web/api/views.py
Original file line number Diff line number Diff line change
Expand Up @@ -620,7 +620,7 @@ def patch(self, request, guid, format=None):
link.mark_capturejob_superseded()

# write new warc and capture
link.write_uploaded_file(uploaded_file, cache_break=True)
link.write_uploaded_file(uploaded_file)

# update internet archive if privacy changes
if 'is_private' in data and was_private != bool(data.get("is_private")) and link.is_permanent():
Expand Down
62 changes: 39 additions & 23 deletions perma_web/perma/models.py
Original file line number Diff line number Diff line change
Expand Up @@ -49,13 +49,12 @@
first_day_of_next_month,
pp_date_from_post,
prep_for_perma_payments,
preserve_perma_warc,
preserve_perma_wacz,
process_perma_payments_transmission,
protocol,
remove_control_characters,
today_next_year,
tz_datetime,
write_resource_record_from_asset,
)

logger = logging.getLogger(__name__)
Expand Down Expand Up @@ -1974,9 +1973,9 @@ def get_pages_jsonl(self):
)
return "\n".join([json.dumps(row) for row in jsonl_rows])

def write_uploaded_file(self, uploaded_file, cache_break=False):
def write_uploaded_file(self, uploaded_file):
"""
Given a file uploaded by a user, create a Capture record and warc.
Given a file uploaded by a user, create a Capture record and WACZ.
"""
from api.utils import get_mime_type, mime_type_lookup # local import to avoid circular import

Expand All @@ -1985,27 +1984,44 @@ def write_uploaded_file(self, uploaded_file, cache_break=False):
file_name = f'upload.{mime_type_lookup[mime_type]["new_extension"]}'
warc_url = f"file:///{self.guid}/{file_name}"

# append a random number to warc_url if we're replacing a file, to avoid browser cache
if cache_break:
r = random.SystemRandom()
warc_url += f"?version={str(r.random()).replace('.', '')}"

capture = Capture(link=self,
role='primary',
status='success',
record_type='resource',
user_upload='True',
content_type=mime_type,
url=warc_url)
warc_size = [] # pass a mutable container to the context manager, so that it can populate it with the size of the finished warc
with preserve_perma_warc(self.guid, self.creation_timestamp, self.warc_storage_file(), warc_size) as warc:
uploaded_file.file.seek(0)
write_resource_record_from_asset(uploaded_file.file.read(), warc_url, mime_type, warc)
upload_capture = Capture(
link=self,
role='primary',
status='success',
record_type='resource',
user_upload=True,
content_type=mime_type,
url=warc_url
)

provenance_capture = Capture(
link=self,
role='provenance_summary',
status='success',
record_type='resource',
user_upload=False,
content_type='text/html',
url='file:///provenance-summary.html'
)

# make the WACZ
self.wacz_size = preserve_perma_wacz(
uploaded_file,
warc_url,
mime_type,
self.guid,
self.submitted_url,
self.submitted_title,
self.creation_timestamp,
self.wacz_storage_file()
)
self.warc_size = 0 # TODO(review): is this reset necessary? Uploads are now written only as a WACZ (see wacz_size above)

self.captured_by_software = 'upload'
self.captured_by_browser = None
self.warc_size = warc_size[0]
self.save(update_fields=['captured_by_software', 'captured_by_browser', 'warc_size'])
capture.save()
self.save(update_fields=['captured_by_software', 'captured_by_browser', 'warc_size', 'wacz_size'])
upload_capture.save()
provenance_capture.save()

def safe_delete_warc(self):
old_name = self.warc_storage_file()
Expand Down
4 changes: 4 additions & 0 deletions perma_web/perma/settings/deployments/settings_common.py
Original file line number Diff line number Diff line change
Expand Up @@ -625,6 +625,10 @@
# Before deployment, we suppress the addition of new capture jobs when this file is present
DEPLOYMENT_SENTINEL = '/tmp/perma-deployment-pending'

# Version identifier included in datapackage.json for user uploads;
# deployments replace this value with a short commit hash.
PERMA_VERSION = 'dev'

# Which settings should be available in all Django templates,
# without needing to explicitly pass them via the view?
TEMPLATE_VISIBLE_SETTINGS = (
Expand Down
103 changes: 103 additions & 0 deletions perma_web/perma/templates/provenance-summary.html
Original file line number Diff line number Diff line change
@@ -0,0 +1,103 @@
<!DOCTYPE html>
<html lang="en">
<head>
<title>Provenance summary for user upload for {{url}}</title>
<meta charset="UTF-8" />
<meta name="viewport" content="width=device-width, initial-scale=1.0" />
<!--
Provenance summary page for a user-uploaded capture.
Placeholders filled in at render time: url, mime_type, now, creation_timestamp.
All styling is inline in the style element below; no external assets are referenced.
-->

<style>
/*
NOTE(review): the markup below only uses main, header, h1, p and code.
The video, section, h2, "p span", ul and table rules match nothing in this
template; confirm whether they are kept deliberately before removing them.
*/

/* Reset spacing and use border-box sizing on every element. */
* {
padding: 0px;
margin: 0px;
box-sizing: border-box;
}

/* Base font size; the rem values below are relative to this. */
html {
font-size: 16px;
}

video {
max-width: 100%;
}

/* Centered content column, capped at 90ch wide. */
main {
width: 100%;
padding: 1rem;
margin: auto;
max-width: 90ch;
}

section {
margin-bottom: 1rem;
padding-top: 1rem;
border-top: 1px solid gray;
}

h1, h2 {
margin-bottom: 0.5rem;
}

p {
font-size: 1rem;
line-height: 1.5rem;
margin-bottom: 0.5rem;
}

/* Highlighted inline chip for spans inside paragraphs. */
p span {
display: inline-block;
background-color: antiquewhite;
padding: 0.2rem;
padding-left: 0.35rem;
padding-right: 0.35rem;
border-radius: 0.25rem;
}

ul {
list-style-position: inside;
}

/* Fixed-layout, full-width tables with wrapping cell content. */
table {
table-layout: fixed;
border-collapse: collapse;
width: 100%;
text-align: left;
}

table * {
word-break: break-word;
}

/* Light divider under every row except the last (see the last-of-type rule below). */
table tr {
border-bottom: 1px solid lightgray;
}

table tr td, table tr th {
padding: 0.75rem 0.25rem;
}

table tr td:first-of-type {
min-width: 34ch;
}

table tr:last-of-type {
border-bottom: 0px;
}
</style>

</head>

<body>

<main>

<header>
<h1>Provenance Summary</h1>
<!-- Single statement of what was uploaded (MIME type), when, and which capture of which URL it replaces. -->
<p>The data present in this capture, with MIME type <code>{{ mime_type }}</code>, were uploaded by a Perma user at {{ now }} to replace a failed or unsatisfactory capture of {{ url }} at {{ creation_timestamp }}.</p>
</header>

</main>

</body>

</html>

Loading

0 comments on commit 4dfa0f0

Please sign in to comment.