Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Add binary sufixes to job tools memory specification (e.g. 1GiB vs 1GB) #2858

Merged
merged 1 commit into from
May 21, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
46 changes: 36 additions & 10 deletions src/spikeinterface/core/core_tools.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,16 +10,6 @@
from math import prod

import numpy as np
from tqdm import tqdm

from .job_tools import (
ensure_chunk_size,
ensure_n_jobs,
divide_segment_into_chunks,
fix_job_kwargs,
ChunkRecordingExecutor,
_shared_job_kwargs_doc,
)


def define_function_from_class(source_class, name):
Expand Down Expand Up @@ -447,6 +437,42 @@ def convert_bytes_to_str(byte_value: int) -> str:
return f"{byte_value:.2f} {suffixes[i]}"


_exponents = {
"k": 1e3,
"M": 1e6,
"G": 1e9,
"T": 1e12,
"P": 1e15, # Decimal (SI) prefixes
"Ki": 1024**1,
"Mi": 1024**2,
"Gi": 1024**3,
"Ti": 1024**4,
"Pi": 1024**5, # Binary (IEC) prefixes
}


def convert_string_to_bytes(memory_string: str) -> int:
"""
Convert a memory size string to the corresponding number of bytes.

Parameters:
mem (str): Memory size string (e.g., "1G", "512Mi", "2T").

Returns:
int: Number of bytes.
"""
if memory_string[-2:] in _exponents:
suffix = memory_string[-2:]
mem_value = memory_string[:-2]
else:
suffix = memory_string[-1]
mem_value = memory_string[:-1]

assert suffix in _exponents, f"Unknown suffix: {suffix}"
bytes_value = int(float(mem_value) * _exponents[suffix])
return bytes_value


def is_editable_mode() -> bool:
"""
Check if spikeinterface is installed in editable mode
Expand Down
24 changes: 8 additions & 16 deletions src/spikeinterface/core/job_tools.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,9 +8,9 @@
import platform
import os
import warnings
from spikeinterface.core.core_tools import convert_string_to_bytes

import sys
import contextlib
from tqdm.auto import tqdm

from concurrent.futures import ProcessPoolExecutor
Expand All @@ -23,7 +23,7 @@
- chunk_size: int
Number of samples per chunk
- chunk_memory: str
Memory usage for each job (e.g. "100M", "1G")
Memory usage for each job (e.g. "100M", "1G", "500MiB", "2GiB")
- total_memory: str
Total memory usage (e.g. "500M", "2G")
- chunk_duration : str or float or None
Expand Down Expand Up @@ -149,16 +149,6 @@ def divide_recording_into_chunks(recording, chunk_size):
return all_chunks


_exponents = {"k": 1e3, "M": 1e6, "G": 1e9}


def _mem_to_int(mem):
suffix = mem[-1]
assert suffix in _exponents
mem = int(float(mem[:-1]) * _exponents[suffix])
return mem


def ensure_n_jobs(recording, n_jobs=1):
if n_jobs == -1:
n_jobs = os.cpu_count()
Expand Down Expand Up @@ -206,9 +196,11 @@ def ensure_chunk_size(
chunk_size: int or None
size for one chunk per job
chunk_memory: str or None
must end with "k", "M" or "G"
must end with "k", "M", "G", etc for decimal units and "ki", "Mi", "Gi", etc for
binary units. (e.g. "1k", "500M", "2G", "1ki", "500Mi", "2Gi")
Comment on lines +199 to +200
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

@h-mayorquin I think that this might make things extra complicated. If I understand correctly,

1KB = 1000 bytes
1KiB = 1024 bytes

Is that correct? I had to look up the difference between decimal/binary for memory representation, so I think that an end user could be a bit puzzled by this.

Happy to discuss!

Copy link
Collaborator Author

@h-mayorquin h-mayorquin May 20, 2024

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I think that having the two is confusing indeed.

But the current setup is confusing as well. If somebody allocates 1G using the current SpikeInterface machinery (and SpikeInterface was perfect in its handling of memory ...) the memory reported by the system won't match 1G as the user specified.

The reason for this is that RAM memory is reported in binary base. Both when you buy RAM and when you use utilities like system resource, the numbers you see are in binary base. .

Given that memory allocation here is a RAM issue, I think we should follow the convention of RAM specification (binary). However, we have to do things carefully because the decimal base has been there for so long.

Finally, given that using the job_kwargs is a relatively advanced feature, I am far less concerned about this type of confusion and I am fine with having the user specify which one they want to use.

(Hard drive storage, on the other hand, is reported in base 10, both when you calculate the size of an object in the system and when you buy a hard drive, here some link to the discssion on why this might be confusing for hard drives as well: https://superuser.com/questions/1080633/confusion-with-storage-capacity-powers-of-10-and-2)

total_memory: str or None
must end with "k", "M" or "G"
must end with "k", "M", "G", etc for decimal units and "ki", "Mi", "Gi", etc for
binary units. (e.g. "1k", "500M", "2G", "1ki", "500Mi", "2Gi")
chunk_duration: None or float or str
Units are second if float.
If str then the str must contain units(e.g. "1s", "500ms")
Expand All @@ -219,14 +211,14 @@ def ensure_chunk_size(
elif chunk_memory is not None:
assert total_memory is None
# set by memory per worker size
chunk_memory = _mem_to_int(chunk_memory)
chunk_memory = convert_string_to_bytes(chunk_memory)
n_bytes = np.dtype(recording.get_dtype()).itemsize
num_channels = recording.get_num_channels()
chunk_size = int(chunk_memory / (num_channels * n_bytes))
elif total_memory is not None:
# clip by total memory size
n_jobs = ensure_n_jobs(recording, n_jobs=n_jobs)
total_memory = _mem_to_int(total_memory)
total_memory = convert_string_to_bytes(total_memory)
n_bytes = np.dtype(recording.get_dtype()).itemsize
num_channels = recording.get_num_channels()
chunk_size = int(total_memory / (num_channels * n_bytes * n_jobs))
Expand Down
37 changes: 32 additions & 5 deletions src/spikeinterface/core/tests/test_core_tools.py
Original file line number Diff line number Diff line change
@@ -1,8 +1,6 @@
import platform
import math
from multiprocessing.shared_memory import SharedMemory
from pathlib import Path
import importlib
import pytest
import numpy as np

Expand All @@ -12,10 +10,8 @@
make_paths_absolute,
check_paths_relative,
normal_pdf,
convert_string_to_bytes,
)
from spikeinterface.core.binaryrecordingextractor import BinaryRecordingExtractor
from spikeinterface.core.generate import NoiseGeneratorRecording
from spikeinterface.core.numpyextractors import NumpySorting


if hasattr(pytest, "global_test_folder"):
Expand Down Expand Up @@ -88,6 +84,37 @@ def test_path_utils_functions():
# UNC can be relative to the same UNC
assert check_paths_relative(d, r"\\host\share")

def test_convert_string_to_bytes():
# Test SI prefixes
assert convert_string_to_bytes("1k") == 1000
assert convert_string_to_bytes("1M") == 1000000
assert convert_string_to_bytes("1G") == 1000000000
assert convert_string_to_bytes("1T") == 1000000000000
assert convert_string_to_bytes("1P") == 1000000000000000
# Test IEC prefixes
assert convert_string_to_bytes("1Ki") == 1024
assert convert_string_to_bytes("1Mi") == 1048576
assert convert_string_to_bytes("1Gi") == 1073741824
assert convert_string_to_bytes("1Ti") == 1099511627776
assert convert_string_to_bytes("1Pi") == 1125899906842624
# Test mixed values
assert convert_string_to_bytes("1.5k") == 1500
assert convert_string_to_bytes("2.5M") == 2500000
assert convert_string_to_bytes("0.5G") == 500000000
assert convert_string_to_bytes("1.2T") == 1200000000000
assert convert_string_to_bytes("1.5Pi") == 1688849860263936
# Test zero values
assert convert_string_to_bytes("0k") == 0
assert convert_string_to_bytes("0Ki") == 0
# Test invalid inputs (should raise assertion error)
with pytest.raises(AssertionError) as e:
convert_string_to_bytes("1Z")
assert str(e.value) == "Unknown suffix: Z"

with pytest.raises(AssertionError) as e:
convert_string_to_bytes("1Xi")
assert str(e.value) == "Unknown suffix: Xi"


def test_normal_pdf() -> None:
mu = 4.160771
Expand Down
Loading