Skip to content

Commit

Permalink
Final runs for this round
Browse files (browse the repository at this point in the history)
  • Loading branch information
collijk committed Feb 15, 2025
1 parent 3c65612 commit 4202b5b
Show file tree
Hide file tree
Showing 5 changed files with 51 additions and 51 deletions.
28 changes: 16 additions & 12 deletions src/rra_building_density/constants.py
Original file line number | Diff line number | Diff line change
Expand Up @@ -30,18 +30,18 @@ class BuiltVersion(BaseModel, abc.ABC):
input_template: str
raw_output_template: str

@property
def name(self) -> str:
return f"{self.provider}_v{self.version}"

@abc.abstractmethod
def process_resources(self, resolution: str) -> tuple[str, str]:
raise NotImplementedError

@property
def name(self) -> str:
return f"{self.provider}_{self.version}"


class MicrosoftVersion(BuiltVersion):
provider: Literal["microsoft"] = "microsoft"
version: Literal["2", "3", "4", "5"]
version: Literal["v2", "v3", "v4", "v5", "water_mask"]

def process_resources(self, resolution: str) -> tuple[str, str]:
return {
Expand All @@ -52,33 +52,39 @@ def process_resources(self, resolution: str) -> tuple[str, str]:

MICROSOFT_VERSIONS = {
"2": MicrosoftVersion(
version="2",
version="v2",
time_points=[
f"{y}q{q}" for q, y in itertools.product(range(1, 5), range(2018, 2024))
][:-1],
input_template="predictions/{time_point}/predictions/postprocess_v2/*",
raw_output_template="{time_point}/{time_point}_{tile_key}.tif",
),
"3": MicrosoftVersion(
version="3",
version="v3",
time_points=["2023q3"],
input_template="predictions/{time_point}/predictions/ensemble_v3_pp/*",
raw_output_template="{time_point}/{time_point}_{tile_key}.tif",
),
"4": MicrosoftVersion(
version="4",
version="v4",
time_points=["2023q4"],
input_template="predictions/{time_point}/predictions/v45_ensemble/*",
raw_output_template="{time_point}/{tile_key}.tif",
),
"5": MicrosoftVersion(
version="5",
version="v5",
time_points=[
f"{y}q{q}" for q, y in itertools.product(range(1, 5), range(2020, 2024))
][2:],
input_template="predictions/{time_point}/az_8_ensemble/*",
raw_output_template="{time_point}/{tile_key}.tif",
),
"water_mask": MicrosoftVersion(
version="water_mask",
time_points=[""],
input_template="permanent_or_seasonal_water/*",
raw_output_template="{tile_key}.tif",
),
}


Expand All @@ -92,7 +98,6 @@ class GHSLVersion(BuiltVersion):

provider: Literal["ghsl"] = "ghsl"
version: Literal["r2023a"]
raw_time_points: list[str]

def prefix_and_measure(self, raw_measure: str) -> tuple[str, str]:
return self.measure_map[raw_measure]
Expand All @@ -107,10 +112,9 @@ def process_resources(self, resolution: str) -> tuple[str, str]:
GHSL_VERSIONS = {
"r2023a": GHSLVersion(
version="r2023a",
raw_time_points=[str(y) for y in range(1975, 2035, 5)],
time_points=[f"{y}q1" for y in range(1975, 2030, 5)],
input_template="GHS_{measure_prefix}_GLOBE_R2023A/GHS_{measure}_E{year}_GLOBE_R2023A_4326_3ss/V1-0/GHS_{measure}_E{year}_GLOBE_R2023A_4326_3ss_V1_0.zip",
raw_output_template="GHS_{measure}_E{year}_GLOBE_R2023A_4326_3ss_V1_0.tif",
raw_output_template="{time_point}/GHS_{measure}_E{year}_GLOBE_R2023A_4326_3ss_V1_0.tif",
),
}

Expand Down
9 changes: 5 additions & 4 deletions src/rra_building_density/extract/ghsl.py
Original file line number | Diff line number | Diff line change
Expand Up @@ -32,13 +32,13 @@ def download_ghsl_zipfile(url: str, out_zipfile: Path, *, progress_bar: bool) ->

def extract_ghsl_main(
raw_measure: str,
year: str,
time_point: str,
output_dir: str,
*,
progress_bar: bool,
) -> None:
ghsl_version = bdc.GHSL_VERSIONS["r2023a"]
time_point = f"{year}q1"
year = int(time_point[:4])
measure_prefix, measure = ghsl_version.prefix_and_measure(raw_measure)
template_kwargs = {
"measure_prefix": measure_prefix,
Expand All @@ -57,7 +57,8 @@ def extract_ghsl_main(
download_ghsl_zipfile(url, out_zipfile, progress_bar=progress_bar)

print("Extracting GHSL data...")
out_file = ghsl_version.raw_output_template.format(**template_kwargs)
# Time point is already included in the output_root
out_file = ghsl_version.raw_output_template.format(**template_kwargs).split("/")[-1]
with zipfile.ZipFile(out_zipfile, "r") as zip_ref:
zip_ref.extract(out_file, output_root)

Expand Down Expand Up @@ -94,7 +95,7 @@ def extract_ghsl(
bd_data = BuildingDensityData(output_dir)

ghsl_version = bdc.GHSL_VERSIONS["r2023a"]
time_points = clio.convert_choice(time_point, ghsl_version.raw_time_points)
time_points = clio.convert_choice(time_point, ghsl_version.time_points)

jobmon.run_parallel(
task_name="ghsl",
Expand Down
4 changes: 3 additions & 1 deletion src/rra_building_density/extract/microsoft.py
Original file line number | Diff line number | Diff line change
Expand Up @@ -89,7 +89,9 @@ def extract_microsoft_tiles_main(
input_stem = msft_version.input_template.format(time_point=time_point)
input_root = f"{blob_url}/{input_stem}?{blob_key}"

output_root = bd_data.provider_root(msft_version) / time_point
output_root = bd_data.provider_root(msft_version)
if time_point:
output_root = output_root / time_point
mkdir(output_root, exist_ok=True, parents=True)

overwrite_flag = "true" if overwrite else "false"
Expand Down
46 changes: 12 additions & 34 deletions src/rra_building_density/process/ghsl.py
Original file line number | Diff line number | Diff line change
Expand Up @@ -24,7 +24,7 @@ def format_ghsl_main(
tile_index = bd_data.load_tile_index(resolution)
tile_index_info = bd_data.load_tile_index_info(resolution)

print("Selecting tile and building template")
print("Building template")
block_index = tile_index[tile_index.block_key == block_key]
block_poly_series = block_index.dissolve("block_key").geometry
block_poly = block_poly_series.iloc[0]
Expand All @@ -38,36 +38,17 @@ def format_ghsl_main(
crs=bdc.CRSES["equal_area"],
)

print("Selecting year weight")
year = int(time_point[:4])
start = year - year % 5
if year % 5 == 0:
end = start
w = 1.0
else:
end = start + 5
t = float(time_point[:4]) + float(time_point[-1:]) / 4
w = (t - start) / (end - start)

print("loading start tile")
start_tile = bd_data.load_provider_tile(
ghsl_version,
bounds=block_poly_ghsl,
measure=ghsl_measure,
year=str(start),
)
start_tile = start_tile.astype(np.float32) / 10000.0
print("loading end tile")
end_tile = bd_data.load_provider_tile(
print("Loading GHSL data")
raw_tile = bd_data.load_provider_tile(
ghsl_version,
bounds=block_poly_ghsl,
measure=ghsl_measure,
year=str(end),
time_point=time_point,
year=time_point[:4],
)
end_tile = end_tile.astype(np.float32) / 10000.0
raw_tile = raw_tile.astype(np.float32) / 10000.0

print("Resampling")
raw_tile = start_tile * (1 - w) + end_tile * w
tile = raw_tile.set_no_data_value(np.nan).resample_to(block_template, "average")
tile = utils.suppress_noise(tile)
print("Saving")
Expand All @@ -82,6 +63,7 @@ def format_ghsl_main(


@click.command() # type: ignore[arg-type]
@clio.with_measure(bdc.GHSLVersion.measure_map)
@clio.with_block_key()
@clio.with_time_point()
@clio.with_resolution(bdc.RESOLUTIONS)
Expand All @@ -105,12 +87,14 @@ def format_ghsl_task(
@clio.with_queue()
def format_ghsl(
measure: list[str],
time_point: list[str],
time_point: str,
resolution: str,
output_dir: str,
queue: str,
) -> None:
"""Format GHSL building density data."""
ghsl_version = bdc.GHSL_VERSIONS["r2023a"]
time_points = clio.convert_choice(time_point, ghsl_version.time_points)
bd_data = BuildingDensityData(output_dir)

print("Loading the tile index")
Expand All @@ -119,13 +103,7 @@ def format_ghsl(
njobs = len(block_keys) * len(time_point) * len(measure)
print(f"Formating building density for {njobs} block-times")

memory, runtime = {
"40": ("8G", "20m"),
"100": ("8G", "20m"),
"250": ("15G", "20m"),
"500": ("60G", "20m"),
"1000": ("250G", "45m"),
}[resolution]
memory, runtime = ghsl_version.process_resources(resolution)

jobmon.run_parallel(
task_name="ghsl",
Expand All @@ -137,7 +115,7 @@ def format_ghsl(
node_args={
"block-key": block_keys,
"measure": measure,
"time-point": time_point,
"time-point": time_points,
},
task_resources={
"queue": queue,
Expand Down
15 changes: 15 additions & 0 deletions src/rra_building_density/process/microsoft.py
Original file line number | Diff line number | Diff line change
Expand Up @@ -9,6 +9,8 @@
from rra_building_density import utils
from rra_building_density.data import BuildingDensityData

USE_WATER_MASK = False


def format_microsoft_main(
block_key: str,
Expand Down Expand Up @@ -63,6 +65,7 @@ def format_microsoft_main(
bd_tile = bd_data.load_provider_tile(
msft_version, tile_key=tile_key, time_point=time_point
)

# The resolution of the MSFT tiles has too many decimal points.
# This causes tiles slightly west of the antimeridian to cross
# over and really mucks up reprojection. We'll clip the values
Expand All @@ -80,6 +83,14 @@ def format_microsoft_main(
f=ymax,
)
bd_tile = bd_tile.unset_no_data_value().set_no_data_value(np.nan)
if USE_WATER_MASK:
# mask out water
mask_version = bdc.MICROSOFT_VERSIONS["water_mask"]
mask = bd_data.load_provider_tile(
mask_version, tile_key=tile_key
).to_numpy()
bd_tile._ndarray[mask] = np.nan # noqa: SLF001

reprojected_tile = bd_tile.reproject(
dst_resolution=block_template.x_resolution,
dst_crs=block_template.crs,
Expand Down Expand Up @@ -131,6 +142,10 @@ def format_microsoft(
queue: str,
) -> None:
"""Format Microsoft building density data."""
if version == "water_mask":
msg = "Formatting can't be run on water mask"
raise NotImplementedError(msg)

msft_version = bdc.MICROSOFT_VERSIONS[version]
time_points = clio.convert_choice(time_point, msft_version.time_points)

Expand Down

0 comments on commit 4202b5b

Please sign in to comment.