# This file is part of the Open Data Cube, see https://opendatacube.org for more information
#
# Copyright (c) 2015-2020 ODC Contributors
# SPDX-License-Identifier: Apache-2.0
"""
Write Cloud Optimized GeoTIFFs from xarrays.
"""
from __future__ import annotations
import warnings
from contextlib import contextmanager
from pathlib import Path
from typing import Any, Dict, Generator, Iterable, List, Optional, Tuple, Union
from uuid import uuid4
import numpy as np
import rasterio
import xarray as xr
from rasterio.shutil import copy as rio_copy # pylint: disable=no-name-in-module
from ..geobox import GeoBox
from ..math import resolve_nodata
from ..types import Nodata, SomeNodata, SomeShape, shape_, wh_
from ..warp import resampling_s2rio
from ._shared import adjust_blocksize
# pylint: disable=too-many-locals,too-many-branches,too-many-arguments,too-many-statements,too-many-instance-attributes
def _without(cfg: Dict[str, Any], *skip: str) -> Dict[str, Any]:
skip = set(skip)
return {k: v for k, v in cfg.items() if k not in skip}
def check_write_path(fname: Union[Path, str], overwrite: bool) -> Path:
"""
Check path before overwriting.
:param fname: string or Path object
:param overwrite: Whether to remove file when it exists
    exists   overwrite   Action
    ----------------------------------------------
    T        T           delete file, return Path
    T        F           raise IOError
    F        T           return Path
    F        F           return Path
"""
if not isinstance(fname, Path):
fname = Path(fname)
if fname.exists():
if overwrite:
fname.unlink()
else:
raise IOError("File exists")
return fname
def _default_cog_opts(
*, blocksize: int = 512, shape: SomeShape = (0, 0), is_float: bool = False, **other
) -> Dict[str, Any]:
nx, ny = shape_(shape).xy
return {
"tiled": True,
"blockxsize": adjust_blocksize(blocksize, nx),
"blockysize": adjust_blocksize(blocksize, ny),
"zlevel": 6,
"predictor": 3 if is_float else 2,
"compress": "DEFLATE",
**other,
}
def _norm_compression_opts(
compression: Union[bool, str, Dict[str, Any]],
default_compress: str = "deflate",
default_zlevel: int = 2,
) -> Dict[str, Any]:
if isinstance(compression, bool):
if compression:
return {"compress": default_compress, "zlevel": default_zlevel}
return {"compress": None}
if isinstance(compression, str):
compression = {"compress": compression}
return compression
def _get_gdal_metadata(
xx: xr.DataArray | list[xr.DataArray],
tags: Optional[Dict[str, Any]],
) -> Dict[str, Any]:
"""
    Extract GDAL metadata (scales, offsets, units and tags) from one or more bands.

    :param xx: A single ``xarray.DataArray`` or a list of them, one per band
    :param tags: Optional dictionary of tags to include in the metadata
    :return: Dictionary of metadata
"""
meta: Dict[str, Any] = {"tags": tags}
if not isinstance(xx, list):
xx = [xx]
    # Fill each list with None when the attribute is missing: only some bands
    # may carry scale/offset/units, and we need to record which ones do.
scales, offsets, units = [], [], []
for x in xx:
scales.append(x.attrs.get("scales", None))
offsets.append(x.attrs.get("offsets", None))
units.append(x.attrs.get("units", None))
if any(v is not None for v in scales):
meta["scales"] = scales
if any(v is not None for v in offsets):
meta["offsets"] = offsets
if any(v is not None for v in units):
meta["units"] = units
return meta
def _write_cog(
pix: np.ndarray,
geobox: GeoBox,
fname: Union[Path, str],
nodata: Nodata = None,
overwrite: bool = False,
blocksize: Optional[int] = None,
overview_resampling: Optional[str] = None,
overview_levels: Optional[List[int]] = None,
ovr_blocksize: Optional[int] = None,
use_windowed_writes: bool = False,
intermediate_compression: Union[bool, str, Dict[str, Any]] = False,
gdal_metadata: Optional[Dict[str, Any]] = None,
**extra_rio_opts,
) -> Union[Path, bytes]:
if blocksize is None:
blocksize = 512
if ovr_blocksize is None:
ovr_blocksize = blocksize
if overview_resampling is None:
overview_resampling = "nearest"
intermediate_compression = _norm_compression_opts(intermediate_compression)
if pix.ndim == 2:
h, w = pix.shape
nbands = 1
band = 1 # type: Any
elif pix.ndim == 3:
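        # Accept band-last (Y, X, Band) input by moving bands to the front;
        # rasterio expects band-first (Band, Y, X) arrays.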
if pix.shape[:2] == geobox.shape:
pix = pix.transpose([2, 0, 1])
elif pix.shape[-2:] != geobox.shape:
raise ValueError("GeoBox shape does not match image shape")
nbands, h, w = pix.shape # type: ignore
band = tuple(i for i in range(1, nbands + 1))
else:
raise ValueError("Need 2d or 3d ndarray on input")
assert geobox.shape == (h, w)
if overview_levels is None:
if min(w, h) < 512:
overview_levels = []
else:
overview_levels = [2**i for i in range(1, 6)]
if fname != ":mem:":
path = check_write_path(
fname, overwrite
) # aborts if overwrite=False and file exists already
resampling = resampling_s2rio(overview_resampling)
if (blocksize % 16) != 0:
warnings.warn("Block size must be a multiple of 16, will be adjusted")
rio_opts = {
"width": w,
"height": h,
"count": nbands,
"dtype": pix.dtype.name,
"crs": str(geobox.crs) if geobox.crs is not None else None,
"transform": geobox.transform,
**_default_cog_opts(
blocksize=blocksize, shape=wh_(w, h), is_float=pix.dtype.kind == "f"
),
}
if nodata is not None:
rio_opts.update(nodata=nodata)
rio_opts.update(extra_rio_opts)
def _write(pix, band, dst):
if not use_windowed_writes:
dst.write(pix, band)
else:
for _, win in dst.block_windows():
if pix.ndim == 2:
block = pix[win.toslices()]
else:
block = pix[(slice(None),) + win.toslices()]
dst.write(block, indexes=band, window=win)
if gdal_metadata is not None:
# Add scales, offsets and units if present
scales = gdal_metadata.get("scales", None)
if scales is not None:
dst.scales = scales
offsets = gdal_metadata.get("offsets", None)
if offsets is not None:
dst.offsets = offsets
units = gdal_metadata.get("units", None)
if units is not None:
dst.units = units
# Add any tags that are present
tags = gdal_metadata.get("tags", None)
if tags is not None:
dst.update_tags(**tags)
# Deal efficiently with "no overviews needed case"
if len(overview_levels) == 0:
if fname == ":mem:":
with rasterio.MemoryFile() as mem:
with mem.open(driver="GTiff", **rio_opts) as dst:
_write(pix, band, dst)
return bytes(mem.getbuffer())
else:
with rasterio.open(path, mode="w", driver="GTiff", **rio_opts) as dst:
_write(pix, band, dst)
return path
# copy re-compresses anyway so skip compression for temp image
tmp_opts = _without(rio_opts, "compress", "predictor", "zlevel")
tmp_opts.update(intermediate_compression)
with rasterio.Env(GDAL_TIFF_OVR_BLOCKSIZE=ovr_blocksize):
with rasterio.MemoryFile() as mem:
with mem.open(driver="GTiff", **tmp_opts) as tmp:
_write(pix, band, tmp)
tmp.build_overviews(overview_levels, resampling)
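                # Second pass: copy the temporary image together with its overviews
                # to the destination, re-compressing with the requested settings.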
if fname == ":mem:":
with rasterio.MemoryFile() as mem2:
rio_copy(
tmp,
mem2.name,
driver="GTiff",
copy_src_overviews=True,
**_without(
rio_opts,
"width",
"height",
"count",
"dtype",
"crs",
"transform",
"nodata",
),
)
return bytes(mem2.getbuffer())
rio_copy(tmp, path, driver="GTiff", copy_src_overviews=True, **rio_opts)
return path
def write_cog(
geo_im: xr.DataArray,
fname: Union[str, Path],
*,
overwrite: bool = False,
blocksize: Optional[int] = None,
ovr_blocksize: Optional[int] = None,
overviews: Optional[Iterable[xr.DataArray]] = None,
overview_resampling: Optional[str] = None,
overview_levels: Optional[List[int]] = None,
use_windowed_writes: bool = False,
intermediate_compression: Union[bool, str, Dict[str, Any]] = False,
tags: Optional[Dict[str, Any]] = None,
nodata: SomeNodata = "auto",
**extra_rio_opts,
) -> Union[Path, bytes]:
"""
Save ``xarray.DataArray`` to a file in Cloud Optimized GeoTiff format.
:param geo_im: ``xarray.DataArray`` with crs
:param fname: Output path or ``":mem:"`` in which case compress to RAM and return bytes
:param overwrite: True -- replace existing file, False -- abort with IOError exception
:param blocksize: Size of internal tiff tiles (512x512 pixels)
:param ovr_blocksize: Size of internal tiles in overview images (defaults to blocksize)
:param overviews: Write pre-computed overviews if supplied
:param overview_resampling: Use this resampling when computing overviews
    :param overview_levels: List of shrink factors to compute overviews for: [2,4,8,16,32];
        to disable overviews supply an empty list ``[]``
:param nodata: Set ``nodata`` flag to this value if supplied, by default ``nodata`` is
read from the attributes of the input array (``geo_im.odc.nodata``).
:param use_windowed_writes: Write image block by block (might need this for large images)
    :param intermediate_compression: Configure compression settings for the first-pass write,
        default is no compression
:param tags: Dictionary of tags to write into the output file. These are written as
GDAL Metadata items in the GeoTIFF file.
:param extra_rio_opts: Any other option is passed to ``rasterio.open``
:returns: Path to which output was written
:returns: Bytes if ``fname=":mem:"``
    .. note::
       **memory requirements**

       This function generates a temporary in-memory TIFF file without compression
       to speed things up. It then adds overviews to this file and only then copies
       it to the final destination with the requested compression settings. This is
       necessary to produce a compliant COG: the COG standard requires overviews to
       be placed before the native resolution data, and a double pass is currently
       the only way to achieve this.

       As a result this function will use about 1.5 to 2 times the memory taken by ``geo_im``.
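
    A minimal usage sketch (the extent, resolution, dtype and nodata value below are
    illustrative assumptions, not defaults of this function):

    .. code-block:: python

        from odc.geo.geobox import GeoBox
        from odc.geo.xr import xr_zeros

        # construct a small geo-registered array and write it as a COG
        gbox = GeoBox.from_bbox((-10, -5, 10, 5), "epsg:4326", resolution=0.1)
        xx = xr_zeros(gbox, dtype="int16")
        write_cog(xx, "zeros.tif", overwrite=True, nodata=-999)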
"""
nodata = resolve_nodata(nodata, geo_im.dtype, geo_im.odc.nodata)
if overviews is not None:
layers = [geo_im, *overviews]
result = write_cog_layers(
layers,
fname,
overwrite=overwrite,
blocksize=blocksize,
ovr_blocksize=ovr_blocksize,
use_windowed_writes=use_windowed_writes,
intermediate_compression=intermediate_compression,
tags=tags,
nodata=nodata,
**extra_rio_opts,
)
assert result is not None
return result
pix = geo_im.data
geobox = geo_im.odc.geobox
if geobox is None:
raise ValueError("Need geo-registered array on input")
return _write_cog(
pix,
geobox,
fname,
nodata=nodata,
overwrite=overwrite,
blocksize=blocksize,
ovr_blocksize=ovr_blocksize,
overview_resampling=overview_resampling,
overview_levels=overview_levels,
use_windowed_writes=use_windowed_writes,
intermediate_compression=intermediate_compression,
gdal_metadata=_get_gdal_metadata(geo_im, tags),
**extra_rio_opts,
)
def to_cog(
geo_im: xr.DataArray,
blocksize: Optional[int] = None,
ovr_blocksize: Optional[int] = None,
overviews: Optional[Iterable[xr.DataArray]] = None,
overview_resampling: Optional[str] = None,
overview_levels: Optional[List[int]] = None,
use_windowed_writes: bool = False,
intermediate_compression: Union[bool, str, Dict[str, Any]] = False,
tags: Optional[Dict[str, Any]] = None,
**extra_rio_opts,
) -> bytes:
"""
Compress ``xarray.DataArray`` into Cloud Optimized GeoTiff bytes in memory.
    This function doesn't write to disk; it compresses in RAM, which is useful
    for saving data to S3 or other cloud object stores.
:param geo_im: ``xarray.DataArray`` with crs
:param blocksize: Size of internal tiff tiles (512x512 pixels)
:param ovr_blocksize: Size of internal tiles in overview images (defaults to blocksize)
:param overviews: Write pre-computed overviews if supplied
:param overview_resampling: Use this resampling when computing overviews
    :param overview_levels: List of shrink factors to compute overviews for: [2,4,8,16,32]
    :param nodata: Set ``nodata`` flag to this value if supplied, by default ``nodata`` is
        read from the attributes of the input array (``geo_im.odc.nodata``).
:param use_windowed_writes: Write image block by block (might need this for large images)
    :param intermediate_compression: Configure compression settings for the first-pass write,
        default is no compression
:param tags: Dictionary of tags to write into the output file. These are written as
GDAL Metadata items in the GeoTIFF file.
:param extra_rio_opts: Any other option is passed to ``rasterio.open``
:returns: In-memory GeoTiff file as bytes
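
    A minimal usage sketch (``xx`` stands for any geo-registered ``xarray.DataArray``;
    the output path and compression options are illustrative assumptions):

    .. code-block:: python

        data = to_cog(xx, compress="deflate", zlevel=9)

        # the bytes can go straight to object storage or to a local file
        with open("output.tif", "wb") as f:
            f.write(data)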
"""
bb = write_cog(
geo_im,
":mem:",
blocksize=blocksize,
ovr_blocksize=ovr_blocksize,
overviews=overviews,
overview_resampling=overview_resampling,
overview_levels=overview_levels,
use_windowed_writes=use_windowed_writes,
intermediate_compression=intermediate_compression,
tags=tags,
**extra_rio_opts,
)
    assert isinstance(
        bb, (bytes,)
    )  # for mypy's sake: with ":mem:" output the result is bytes
return bb
@contextmanager
def _memfiles_ovr(nlevels) -> Generator[Tuple[rasterio.MemoryFile, ...], None, None]:
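    """
    Create ``nlevels`` sibling in-memory files named ``<name>.tif``, ``<name>.tif.ovr``,
    ``<name>.tif.ovr.ovr``, ... so that GDAL can discover them as external overviews.
    """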
tt = str(uuid4())
dirname, fname = tt.split("-", 1)
fname = fname + ".tif"
mems = []
try:
for i in range(nlevels):
mems.append(
rasterio.MemoryFile(dirname=dirname, filename=fname + ".ovr" * i)
)
yield tuple(mems)
finally:
for mem in mems[::-1]:
if not mem.closed:
mem.close()
del mems
def write_cog_layers(
layers: Iterable[xr.DataArray],
dst: Union[str, Path] = ":mem:",
overwrite: bool = False,
blocksize: Optional[int] = None,
ovr_blocksize: Optional[int] = None,
intermediate_compression: Union[bool, str, Dict[str, Any]] = False,
use_windowed_writes: bool = False,
tags: Optional[Dict[str, Any]] = None,
nodata: Nodata = None,
**extra_rio_opts,
) -> Union[Path, bytes, None]:
"""
    Write a COG from externally computed overviews.

    Generates an in-memory image with multiple side-car overviews, then re-encodes it to
    the destination with ``copy_src_overviews=True``.
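
    A minimal usage sketch (``xx``, ``xx_half`` and ``xx_quarter`` stand for layers the
    caller has already computed at successively coarser resolutions, for example by
    reprojecting to ``gbox.zoom_out(2)``; the names and output path are illustrative):

    .. code-block:: python

        write_cog_layers(
            [xx, xx_half, xx_quarter],
            "multi_res.tif",
            overwrite=True,
        )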
"""
xx = list(layers)
if len(xx) == 0:
return None
if dst != ":mem:":
_ = check_write_path(dst, overwrite)
if blocksize is None:
blocksize = 512
if ovr_blocksize is None:
ovr_blocksize = blocksize
pix = xx[0]
gbox: GeoBox = pix.odc.geobox
rio_opts = _default_cog_opts(
blocksize=blocksize,
shape=gbox.shape,
is_float=pix.dtype.kind == "f",
nodata=nodata,
)
rio_opts.update(extra_rio_opts)
first_pass_cfg: Dict[str, Any] = {
"num_threads": "ALL_CPUS",
"blocksize": blocksize,
"nodata": nodata,
"use_windowed_writes": use_windowed_writes,
"gdal_metadata": _get_gdal_metadata(xx, tags),
**_norm_compression_opts(intermediate_compression),
}
with _memfiles_ovr(len(xx)) as mm:
temp_fname = mm[0].name
# write each layer into mem image
for img, m in zip(xx, mm):
_write_cog(
img.data,
img.odc.geobox,
m.name,
overview_levels=[],
**first_pass_cfg,
)
# copy with recompression
with rasterio.Env(
GDAL_TIFF_OVR_BLOCKSIZE=ovr_blocksize,
GDAL_DISABLE_READDIR_ON_OPEN=False, # force finding of .ovr(s)
NUM_THREADS="ALL_CPUS",
GDAL_NUM_THREADS="ALL_CPUS",
):
if dst == ":mem:":
with rasterio.MemoryFile() as _dst:
rio_copy(temp_fname, _dst.name, copy_src_overviews=True, **rio_opts)
return bytes(_dst.getbuffer()) # makes a copy of compressed data
else:
rio_copy(temp_fname, dst, copy_src_overviews=True, **rio_opts)
return Path(dst)