|
import os |
|
import subprocess |
|
import sys |
|
import argparse |
|
from concurrent.futures import ( |
|
ProcessPoolExecutor, |
|
as_completed, |
|
) |
|
from zipnn_compress_file import compress_file |
|
import zipnn |
|
|
|
# Put the directory one level above this script on the module search path.
# NOTE(review): this runs after the `zipnn_compress_file` import above, so it
# cannot influence how that import is resolved.
sys.path.append(
    os.path.abspath(os.path.join(os.path.dirname(__file__), os.pardir))
)
|
|
|
|
|
# Binary size units expressed in bytes; each unit is 1024 times the previous.
KB = 1 << 10
MB = KB << 10
GB = MB << 10
|
|
|
|
|
def check_and_install_zipnn():
    """Make sure the ``zipnn`` package can be imported, installing it if needed.

    The import is attempted first; on ``ImportError`` the package is installed
    (or upgraded) with ``pip`` using the interpreter that runs this script,
    and the import is retried.

    NOTE(review): this file also imports ``zipnn`` at module level, so when the
    script itself is executed that import is reached before this function.

    Raises:
        subprocess.CalledProcessError: if the ``pip`` command exits with a
            non-zero status.
        ImportError: if ``zipnn`` still cannot be imported after installation.
    """
    import importlib

    try:
        import zipnn  # noqa: F401 -- imported only to probe availability
    except ImportError:
        print("zipnn not found. Installing...")
        subprocess.check_call(
            [
                sys.executable,
                "-m",
                "pip",
                "install",
                "zipnn",
                "--upgrade",
            ]
        )
        # The path finders cache directory listings; without this the package
        # installed a moment ago may stay invisible to the running interpreter.
        importlib.invalidate_caches()
        import zipnn  # noqa: F401 -- fails loudly if the install did not help
|
|
|
|
|
def parse_streaming_chunk_size(
    streaming_chunk_size,
):
    """Convert a chunk-size specification into a number of bytes.

    Accepted forms (case-insensitive, surrounding whitespace ignored):

    * a plain non-negative integer, given as ``int`` or ``str`` -- taken as a
      byte count;
    * an integer followed by ``K``, ``M`` or ``G`` and an optional trailing
      ``B`` (for example ``"512KB"`` or ``"4m"``) -- scaled by ``KB``, ``MB``
      or ``GB`` respectively.

    Args:
        streaming_chunk_size: the size specification.

    Returns:
        The size in bytes as an ``int``.

    Raises:
        ValueError: if the value matches neither form or is negative.
    """
    text = str(streaming_chunk_size).strip()
    if text.isdecimal():
        return int(text)

    invalid = (
        f"Invalid streaming chunk size: {streaming_chunk_size!r}. "
        "Use a number of bytes or an integer followed by KB, MB or GB."
    )

    spec = text.lower()
    # The trailing "b" is optional, but no other character may take its place.
    if spec.endswith("b"):
        spec = spec[:-1]
    number_part, unit = spec[:-1], spec[-1:]

    if unit not in ("k", "m", "g"):
        raise ValueError(invalid)
    try:
        size_value = int(number_part)
    except ValueError:
        raise ValueError(invalid) from None
    if size_value < 0:
        raise ValueError(invalid)

    multipliers = {"k": KB, "m": MB, "g": GB}
    return size_value * multipliers[unit]
|
|
|
|
|
def _select_files_to_compress(suffix, path, r, force):
    """Collect the files under *path* whose names end with *suffix*.

    Returns a ``(files_found, file_list)`` tuple: ``files_found`` tells whether
    any matching file exists at all, ``file_list`` holds the full paths of the
    files that should actually be compressed (matching files the user declined
    to overwrite are left out).
    """
    directories_to_search = (
        os.walk(path) if r else [(path, [], os.listdir(path))]
    )

    files_found = False
    file_list = []
    for root, _, files in directories_to_search:
        for file_name in files:
            if not file_name.endswith(suffix):
                continue
            full_path = os.path.join(root, file_name)
            # os.listdir() also returns sub-directories; skip anything that
            # is not a regular file.
            if not os.path.isfile(full_path):
                continue
            files_found = True

            # Look for an existing archive next to the input file rather than
            # in the current working directory.
            # NOTE(review): assumes compress_file writes "<input>.znn" beside
            # the input -- confirm against zipnn_compress_file.
            compressed_path = full_path + ".znn"
            if not force and os.path.exists(compressed_path):
                user_input = (
                    input(
                        f"{compressed_path} already exists; overwrite (y/n)? "
                    )
                    .strip()
                    .lower()
                )
                if user_input not in ("y", "yes"):
                    print(f"Skipping {file_name}...")
                    continue

            file_list.append(full_path)
    return files_found, file_list


def compress_files_with_suffix(
    suffix,
    dtype="",
    streaming_chunk_size=1048576,
    path=".",
    delete=False,
    r=False,
    force=False,
    max_processes=1,
):
    """Compress every file under *path* whose name ends with *suffix*.

    Matching files are handed to ``compress_file`` in worker processes. When
    ``<file>.znn`` already exists and *force* is false, the user is asked on
    stdin whether to overwrite; declined files are skipped.

    Args:
        suffix: file-name ending to match.
        dtype: forwarded unchanged to ``compress_file``.
        streaming_chunk_size: chunk size as accepted by
            ``parse_streaming_chunk_size``; the parsed byte count is forwarded
            to ``compress_file``.
        path: directory to search.
        delete: forwarded unchanged to ``compress_file``.
        r: search sub-directories recursively when true.
        force: skip the overwrite prompt when true.
        max_processes: maximum number of worker processes.

    Raises:
        ValueError: if *streaming_chunk_size* cannot be parsed.
    """
    streaming_chunk_size = parse_streaming_chunk_size(streaming_chunk_size)

    files_found, file_list = _select_files_to_compress(suffix, path, r, force)

    if not files_found:
        print(f"No files with the suffix '{suffix}' found.")
        return
    if not file_list:
        # Every matching file was skipped by the user; nothing to run.
        return

    # The executor caps concurrency at max_processes, so every file can be
    # submitted up front; results are reported as they complete.
    with ProcessPoolExecutor(max_workers=max_processes) as executor:
        future_to_file = {
            executor.submit(
                compress_file,
                file_path,
                dtype,
                streaming_chunk_size,
                delete,
                # Passed positionally as before. NOTE(review): presumably the
                # overwrite flag, since prompting is handled above -- confirm.
                True,
            ): file_path
            for file_path in file_list
        }
        for future in as_completed(future_to_file):
            file_path = future_to_file[future]
            try:
                future.result()
            except Exception as exc:
                # Best effort: report the failure and keep processing the rest.
                print(f"File {file_path} generated an exception: {exc}")
|
|
|
|
|
def _build_arg_parser():
    """Create the command-line parser for this script."""
    parser = argparse.ArgumentParser(
        description="Enter a suffix to compress, (optional) dtype, (optional) streaming chunk size, (optional) path to files."
    )
    parser.add_argument(
        "suffix",
        type=str,
        help="Specify the file suffix to compress all files with that suffix. If a single file name is provided, only that file will be compressed.",
    )
    parser.add_argument(
        "--float32",
        action="store_true",
        help="A flag that triggers float32 compression",
    )
    parser.add_argument(
        "--streaming_chunk_size",
        type=str,
        help="An optional streaming chunk size. The format is int (for size in Bytes) or int+KB/MB/GB. Default is 1MB",
    )
    parser.add_argument(
        "--path",
        type=str,
        help="Path to files to compress",
    )
    parser.add_argument(
        "--delete",
        action="store_true",
        help="A flag that triggers deletion of a single file instead of compression",
    )
    # One option with a short and a long spelling, so both set the same value.
    parser.add_argument(
        "-r",
        "--recursive",
        action="store_true",
        help="A flag that triggers recursive search on all subdirectories",
    )
    parser.add_argument(
        "--force",
        action="store_true",
        help="A flag that forces overwriting when compressing.",
    )
    parser.add_argument(
        "--max_processes",
        type=int,
        help="The amount of maximum processes.",
    )
    return parser


def _collect_optional_kwargs(args):
    """Map parsed options to keyword arguments for compress_files_with_suffix.

    Only options the user actually supplied are included, so the defaults of
    ``compress_files_with_suffix`` apply to everything else.
    """
    optional_kwargs = {}
    if args.float32:
        optional_kwargs["dtype"] = 32
    if args.streaming_chunk_size is not None:
        optional_kwargs["streaming_chunk_size"] = args.streaming_chunk_size
    if args.path is not None:
        optional_kwargs["path"] = args.path
    if args.delete:
        optional_kwargs["delete"] = args.delete
    if args.recursive:
        # Previously the value of "-r" alone was forwarded, so "--recursive"
        # on its own was silently ignored.
        optional_kwargs["r"] = True
    if args.force:
        optional_kwargs["force"] = args.force
    if args.max_processes:
        optional_kwargs["max_processes"] = args.max_processes
    return optional_kwargs


if __name__ == "__main__":
    # Show a short usage hint when the script is called without arguments.
    if len(sys.argv) < 2:
        print("Usage: python compress_files.py <suffix>")
        print("Example: python compress_files.py 'safetensors'")
        sys.exit(1)

    args = _build_arg_parser().parse_args()
    optional_kwargs = _collect_optional_kwargs(args)

    check_and_install_zipnn()
    compress_files_with_suffix(args.suffix, **optional_kwargs)
|
|