From 81018f06fa15517917c4c6e52d0212ca669b35dc Mon Sep 17 00:00:00 2001 From: Aaryan Kumar Sinha Date: Sat, 13 Dec 2025 01:30:33 +0530 Subject: [PATCH] Added --overwrite flag to zimit --- CHANGELOG.md | 3 ++ offliner-definition.json | 6 +++ src/zimit/zimit.py | 6 ++- tests/conftest.py | 14 ++++++ tests/data/example-response.warc | Bin 0 -> 2272 bytes tests/test_overwrite.py | 83 +++++++++++++++++++++++++++++++ 6 files changed, 110 insertions(+), 2 deletions(-) create mode 100644 tests/conftest.py create mode 100644 tests/data/example-response.warc create mode 100644 tests/test_overwrite.py diff --git a/CHANGELOG.md b/CHANGELOG.md index 58fb40a..2a99b30 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -7,6 +7,9 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0 ## [Unreleased] +### Added +- Added `--overwrite` flag to overwrite existing ZIM file if it exists (#399) + ### Changed - Fix issues preventing interrupted crawls from being resumed. (#499) - Ensure build directory is used explicitly instead of a randomized subdirectory when passed, and pre-create it if it does not exist. 
diff --git a/offliner-definition.json b/offliner-definition.json index 4000466..4bb68b5 100644 --- a/offliner-definition.json +++ b/offliner-definition.json @@ -970,6 +970,12 @@ "alias": "name", "pattern": "^([a-z0-9\\-\\.]+_)([a-z\\-]+_)([a-z0-9\\-\\.]+)$", "relaxedPattern": "^[A-Za-z0-9._-]+$" + }, + "overwrite": { + "type": "boolean", + "required": false, + "title": "Overwrite", + "description": "Whether to overwrite existing ZIM file if it exists" } } } diff --git a/src/zimit/zimit.py b/src/zimit/zimit.py index e982cbd..b205007 100755 --- a/src/zimit/zimit.py +++ b/src/zimit/zimit.py @@ -849,6 +849,9 @@ def run(raw_args): warc2zim_args.append("--lang") warc2zim_args.append(known_args.zim_lang) + if known_args.overwrite: + warc2zim_args.append("--overwrite") + logger.info("----------") logger.info("Testing warc2zim args") logger.info("Running: warc2zim " + " ".join(warc2zim_args)) @@ -1036,7 +1039,6 @@ def run(raw_args): warc_files.append(Path(extract_path)) else: - logger.info(f"Running browsertrix-crawler crawl: {cmd_line}") crawl = subprocess.run(crawler_args, check=False) if ( @@ -1091,7 +1093,7 @@ def run(raw_args): logger.info("----------") logger.info( f"Processing WARC files in/at " - f'{" ".join(str(warc_file) for warc_file in warc_files)}' + f"{' '.join(str(warc_file) for warc_file in warc_files)}" ) warc2zim_args.extend(str(warc_file) for warc_file in warc_files) diff --git a/tests/conftest.py b/tests/conftest.py new file mode 100644 index 0000000..d51650d --- /dev/null +++ b/tests/conftest.py @@ -0,0 +1,14 @@ +import pytest + +from zimit import zimit as app + +""" + cleanup disabled because atexit hooks run at the very end of the Python process + shutdown. By the time cleanup() is called, the logging module has already closed its + file streams. 
+""" + + +@pytest.fixture(autouse=True) +def disable_zimit_cleanup(monkeypatch): + monkeypatch.setattr(app, "cleanup", lambda: None) diff --git a/tests/data/example-response.warc b/tests/data/example-response.warc new file mode 100644 index 0000000000000000000000000000000000000000..143b947d121e61b479cf9cae596b3853a9a60633 GIT binary patch literal 2272 zcmb`I`9Bkk1IJ13mB}8VF(i@kY%^vP#gc21YjZ|(tKqG9zOV21Klpw>KYxDuyxy<(2SO1MX-n+EA32QhCSDKm^z*;tt%-oa zbzumE4h)IVMxfD1Jx$oZqp70_gTvu^a6R;OZ!|(H;-5YrE|U`=pae?&C9rW^Ya*p1 zbjT~mR!ua~n4kol@A;&6KjUFCyxAaDN7gUAWNv-;u#?O2K?`&&1yHTSX2yZ~do0BF z1v%l+=Di-I;tyLt)05F<6`6pGKI@`J(9hMq1{(0}2gk8{zb>Pdn ztf2h@%$|g4W_!Wub6PgVBlKrY&?+_I1R=Rwg_rSHm)Ep2@w(m?{$*hGDov_TiqXL} z0tcpoDm71%@65h^YyJA;--K40^~0$xbQB6a&<_M!xeX7W*;F6p>U2!VxOy1^y$a5Q z-pjO~%c9tE?}km)g1?1MA5sn`*w z$lKUUV=a7qe20eud{_YdRm1V!X&#}%_2<; zXY$Dd(Rrsz$A=p4{*@O+`xQl->mGC&Kt>|U@w4i`4>?aKf|ZRw&fGDFxi((>swq|x zEJ-enG&wo9g*lq8WCo_E$T*v2AT1<;SSM-eXF!jHexp|6`4&aoOOY-yjl?Oy14%U_ z7x;tIX6YVO19|aFr*itiluYb0eR&EgH!r|eIE(wiWj}%a%kT15Qt`cPy}M_!gXME^ z-*5=?N(cWyD!?to@xoeU!EuGI<4jAb=T8aHF|S$QzZ%#@x?aXqtJfMaNU0)v-@=d4 zVhUJS?IT!ujK6MK%=`r3gMMQ{Q`-za=tu1&>2F7R1@bzSQU+6QK5he86a*^%nVmee zZlYH*D|*?IPYec9Bf?!h)a3;k$h*MgqrHm>&Sm&Ams+1QAlFv_EKas}NY1s%CFspf zFi|5sG6(Z*1$gY>LEu?3NB|;sGsrO61_RhkBY$gXypsfgb@9Hoh>$adRqN~4Qg zl_)0UBp=DsOvUiy`T2mO49{tc%+KMV`7?C|dMl|0gd$8u(Qm&XJ+++rn_k~PGgMojC`zD!i@ z6PGKwA7;NuQ2=@C$oRj}X)uO2txn)@yLoe1YAGD@@j;}z) zb<{ILaFg#+BRTs7{n=Ng7}lyrUF>w~f#n2oN$klv((};%fk97HV_I}_!G=+MwVqac zXinZrw95%0>+%D}#`Mwxwx#h`z5a+FBG%+0A{ff1h zKR>m03#_$;o-QX4m(;D+HryntdXKkob^StIv-Z|`I^t2mx5lfc79i2WC#rOFYs{dI zg|<)8ih=r}UOoqvsUxlv)w0U;xHvmDIzI6k{LKK0bd4?1y_;Q9i%t|4)6CQtB6gU?})Q-i!JMs0gDfL|e|fCnnTq)tcbTzjc4<%Y+EE z#ni*By(?Y{UA8f=x(Iv?)b53Q5t8Yr?andDyRP*7EgFyQH42e zuacpvQqzv4Y={{0&aenFiM`WqwXlHV`sOAP(2mztg{DvB`LyC!M@VAyM7 zS14cud+eaoBLn0xaRwvtMs`zsRAdHj3)HX^$@~-ZAOq9ancd24=pmnCrrvma*d4A#_O6AbcZTEp5`+$D98#-XBzbn{k>%2>;U*#I@X_s6KHew>eu=87 RH3|9e>Fz|Z3l0B4?{AQ;Bs>5B literal 0 HcmV?d00001 diff --git 
a/tests/test_overwrite.py b/tests/test_overwrite.py
new file mode 100644
index 0000000..e41baca
--- /dev/null
+++ b/tests/test_overwrite.py
@@ -0,0 +1,51 @@
+import pathlib
+
+import pytest
+
+from zimit.zimit import run
+
+TEST_DATA_DIR = pathlib.Path(__file__).parent / "data"
+
+
+def test_overwrite_flag_behaviour(tmp_path):
+    """Check that an existing ZIM is only replaced when --overwrite is passed.
+
+    Runs zimit four times against the same output file: an initial run that
+    creates the ZIM, two runs without --overwrite (with and without --warcs)
+    that are expected to exit with code 2, and a final run with --overwrite
+    that is expected to succeed.
+    """
+    zim_output = "overwrite-test.zim"
+    output_path = tmp_path / zim_output
+
+    # Argument groups shared by the invocations below; a fresh list is built
+    # for every call so no run can see a list mutated by a previous one.
+    seed_args = ["--seeds", "https://example.com"]
+    warc_args = ["--warcs", str(TEST_DATA_DIR / "example-response.warc")]
+    output_args = [
+        "--output",
+        str(tmp_path),
+        "--zim-file",
+        zim_output,
+        "--name",
+        "overwrite-test",
+    ]
+
+    # 1st run → creates file
+    result = run([*seed_args, *warc_args, *output_args])
+    assert result in (None, 100)
+    assert output_path.exists()
+
+    # 2nd run, from WARC, no overwrite → should fail
+    with pytest.raises(SystemExit) as exc:
+        run([*seed_args, *warc_args, *output_args])
+    assert exc.value.code == 2
+
+    # 3rd run, without --warcs, no overwrite → should fail as well
+    with pytest.raises(SystemExit) as exc:
+        run([*seed_args, *output_args])
+    assert exc.value.code == 2
+
+    # 4th run, with overwrite → should succeed
+    result = run([*seed_args, *warc_args, *output_args, "--overwrite"])
+    assert result in (None, 100)
+    assert output_path.exists()