Module pyl4c.lib.transcom
For all things related to TransCom regions but, in particular, for the statistical summary of SMAP L4C (or other raster array data) by TransCom region. The TransCom project seems to be poorly documented, but Carbon Tracker [1] uses it and provides the data used here. Intended use (Example):
>>> f = h5py.File(file_path)
>>> soc = f['SOC/soc_mean']
>>> tc = TransCom()
>>> tc.summarize_by_transcom(np.where(soc == -9999, 0, soc), 'M09')
{'Australia': [736.73413, 914.3717],
'Eurasia Boreal': [3593.8977, 1461.0577],
...
'Tropical Asia': [1294.8158, 1264.5378]}
Command line use:
$ python transcom.py summarize ./files/*.h5
--field="SOC/soc_mean"
--output-path="./summaries.csv"
$ python transcom.py summarize ./files/*.h5
--field="SOC/soc_mean"
--output-path="./summaries.csv" --summaries="('nanmean',)"
Classes
class TransCom
-
Expand source code
class TransCom(object): ''' Convenience class for doing zonal statistics over TransCom regions. ''' # Region codes for non-terrestrial/ off-shore regions offshore_region_codes = range(12, 22) onshore_region_codes = range(1, 12) onshore_region_labels = ( # i.e., regions 1 through 11, inclusive 'North American Boreal', 'North American Temperate', 'South American Tropical', 'South American Temperate', 'Northern Africa', 'Southern Africa', 'Eurasia Boreal', 'Eurasia Temperate', 'Tropical Asia', 'Australia', 'Europe') onshore_regions = dict(zip(onshore_region_codes, onshore_region_labels)) def __init__(self): f = netcdf_file(TRANSCOM_DATA, 'r') self.areas = f.variables['transcom_regions_area'].data.copy() regions = f.variables['transcom_regions'].data.copy() f.close() # TransCom array is oriented south-up, so flip it self.data = np.flip(regions.astype(np.int8), 0) def __filter__(self, arr): 'Filters the TransCom regions array to terrestrial codes only' return np.where( ~np.isin(self.data, self.offshore_region_codes), self.data, 0) def __resample__(self, arr, shp): 'Nearest neighbor interpolation to a new pixel size' # Nearest neighbor interpolation; even though ndimage's zoom() # gets edge pixels wrong, it is probably the best choice here return resize(arr, shp, order = 0, preserve_range = True) @cached_property def __transcom_1km__(self): rast = self.transcom_on_ease2_grid( grid = 'M01', terrestrial = True) return self.__resample__( rast.ReadAsArray(), EASE2_GRID_PARAMS['M01']['shape']) @cached_property def __transcom_9km__(self): rast = self.transcom_on_ease2_grid( grid = 'M09', terrestrial = True) return self.__resample__( rast.ReadAsArray(), EASE2_GRID_PARAMS['M09']['shape']) @cached_property def __area_by_region__(self): 'Based on a 9-km EASE-Grid (2.0), calculates area in square meters' # Reverse the dictionary so that labels identify region codes areas = dict((v, k) for k, v in self.onshore_regions.items()) transcom = self.__transcom_9km__ for i, label in self.onshore_regions.items(): areas[label] = transcom[np.where(transcom == i)].shape[0] * 81e6 return areas @property def __reported_area_by_region__(self): 'Reports area in square meters for each TransCom region' # Reverse the dictionary so that labels identify region codes areas = dict((v, k) for k, v in self.onshore_regions.items()) for i, label in self.onshore_regions.items(): areas[label] = self.areas[i] return areas def as_ease2_array(self, grid = 'M01'): ''' Returns a numpy.ndarray that has the specified EASE-Grid 2.0 shape, with the TransCom classes as values. ''' if grid == 'M01': return self.__transcom_1km__ elif grid == 'M09': return self.__transcom_9km__ else: raise ValueError('Requested grid size not available') def as_raster(self, terrestrial = True): ''' Returns the TransCom data as a `gdal.Dataset`. Parameters ---------- terrestrial : bool True to filter to only terrestrial TransCom regions (Default: True) Returns ------- gdal.Dataset ''' # TransCom (source) spatial reference system wkt0 = osr.SpatialReference() wkt0.ImportFromEPSG(4326) gt0 = (-180, 1, 0, 90, 0, -1) return array_to_raster( self.__filter__(self.data) if terrestrial else self.data, gt0, str(wkt0)) def count_ease2_by_transcom( self, array, grid = 'M01', scale = 1, text_labels = False, nodata = -9999): ''' Calculates the number of non-NaN/ non-missing values in each TransCom region within the provided array. Calls `TransCom.summarize_ease2_by_transcom()` as a subroutine. Parameters ---------- array : numpy.ndarray Data array with the same SRS, grid size as desired for the TransCom data grid : str The EASE-Grid 2.0 name for which the TransCom data should be aligned and resampled on (e.g., "M01" or "M09") scale : int or float Optional scaling parameter to apply to the input array values, e.g., if the array values are (spatial) rates and should be scaled by the (equal) area of the grid cell text_labels : bool True to use the names of the TransCom regions instead of their region codes, in the output nodata : int or float NoData or Fill value in the array data to ignore Returns ------- dict ''' count = lambda arr: np.sum(np.where(np.isnan(arr), 0, 1)) return self.summarize_ease2_by_transcom( array, count, grid, text_labels, nodata) def rescaled(self, size_degrees = 1): ''' Rescales the TransCom grid (nominally 1-degree by 1-degree) to a scalar multiple equirectangular grid size; e.g., size_degrees = 0.5 would enlarge the array from (180, 360) to (360, 720). Parameters ---------- size_degrees : int or float The output (equirectangular) grid resolution in degrees Returns ------- numpy.ndarray ''' opts = { # Options to zoom() 'order': 0, 'zoom': (1/size_degrees), 'mode': 'grid-constant', 'grid_mode': True } return zoom(self.data, **opts) def summarize_ease2_by_transcom( self, array, summaries = dict(mean = np.nanmean, std = np.nanstd), grid = 'M01', scale = 1, text_labels = False, nodata = -9999): ''' Calculates the mean and standard deviation of the input array values within each TransCom class. NOTE: The statistical summaries are accumulated as 32-bit floating point values, which is fastest, but will not be accurate for certain data types. If the input data array are large integers (e.g., soil organic carbon/ SOC values), then this should not be a problem. Parameters ---------- array : numpy.ndarray Data array with the same SRS, grid size as desired for the TransCom data summaries : dict Dictionary of {label: function} for every summary statistic desired; function should be NumPy summary function, e.g., nanmean, nansum, ... grid : str The EASE-Grid 2.0 name for which the TransCom data should be aligned and resampled on (e.g., "M01" or "M09") scale : int or float Optional scaling parameter to apply to the input array values, e.g., if the array values are (spatial) rates and should be scaled by the (equal) area of the grid cell text_labels : bool True to use the names of the TransCom regions instead of their region codes, in the output nodata : int or float NoData or Fill value in the array data to ignore Returns ------- dict Dictionary with TransCom regions as labels and a nested Dictionary with a key-value pair for each desired summary statistic. ''' assert array.ndim == 2 or (array.ndim == 3 and array.shape[0] == 1), 'Can only work with 1-band raster arrays' if array.ndim == 3: array = array[0,...] # Unwrap 1-band raster arrays # Extract grid size in km; get the resampled TransCom array g = int(re.compile(r'.*(?P<km>\d{2})').match(grid).groups()[0]) transcom = getattr(self, '__transcom_%dkm__' % g) assert transcom.shape == array.shape, 'Input array does not match the TransCom regions grid at the specified grid size' # Fill in NaN where there is NoData if nodata is not None: array = np.where(array == -9999, np.nan, array) # Determine how we will organize statistics by class label if text_labels: # Create, e.g., {'Australia': {}, ...} stats = dict([(v, dict()) for v in self.onshore_regions.values()]) else: # Create, e.g., {1: {}, 2: {}, ...} stats = dict([(k, dict()) for k in self.onshore_regions.keys()]) for code, label in self.onshore_regions.items(): i = label if text_labels else code query = np.multiply( # Scale cell values (Default = 1.0) np.where(np.isin(transcom, code), array, np.nan), scale) for stat_name, func in summaries.items(): # NOTE: Runs faster if dtype of accumulator is *not* set stats[i][stat_name] = func(query) return stats def transcom_on_ease2_grid(self, grid = 'M01', terrestrial = True): ''' Projects the equirectangular TransCom regions data onto the EASE-Grid 2.0 spatial reference system. NOTE: EASE-Grid 2.0 only extends to 84 degrees latitude North or South, so the TransCom regions ought to be restricted to this same extent; but it doesn't actually matter and the projection is still accurate. I'm making a note here in case it becomes important later: self.__y_coords__ = f.variables['lat'].data.copy() idx = np.where(np.abs(self.__y_coords__) < 84)[0] transcom = self.data[idx,:] Parameters ---------- grid : str The EASE-Grid 2.0 name for which the TransCom data should be aligned and resampled on (e.g., "M01" or "M09") terrestrial : bool True to filter to only terrestrial TransCom regions Returns ------- gdal.Dataset ''' # EASE-Grid 2.0 (target) spatial reference system wkt = osr.SpatialReference() wkt.ImportFromWkt(EPSG[EASE2_GRID_PARAMS[grid]['epsg']]) # Create a gdal.Dataset from TransCom data rast0 = self.as_raster(terrestrial) gt0 = rast0.GetGeoTransform() wkt0 = str(rast0.GetProjection()) py, px = self.data.shape px += 12 # HACK: Output is clipped for some reason (=/) # The output (projected) raster's GeoTransform is difficult to # determine, but this should do it automatically gt = gdal.AutoCreateWarpedVRT( rast0, str(wkt0), str(wkt), gdal.GRA_NearestNeighbour).GetGeoTransform() # rast0 is input raster, rast is output raster rast = gdal.GetDriverByName('MEM').Create('', px, py, 1, gdalconst.GDT_Int16) rast.SetGeoTransform(gt) rast.SetProjection(str(wkt)) gdal.ReprojectImage( rast0, rast, str(wkt0), str(wkt), gdalconst.GRA_NearestNeighbour) return rast
Convenience class for doing zonal statistics over TransCom regions.
Class variables
var offshore_region_codes
var onshore_region_codes
var onshore_region_labels
var onshore_regions
Methods
def as_ease2_array(self, grid='M01')
-
Expand source code
def as_ease2_array(self, grid = 'M01'): ''' Returns a numpy.ndarray that has the specified EASE-Grid 2.0 shape, with the TransCom classes as values. ''' if grid == 'M01': return self.__transcom_1km__ elif grid == 'M09': return self.__transcom_9km__ else: raise ValueError('Requested grid size not available')
Returns a numpy.ndarray that has the specified EASE-Grid 2.0 shape, with the TransCom classes as values.
def as_raster(self, terrestrial=True)
-
Expand source code
def as_raster(self, terrestrial = True): ''' Returns the TransCom data as a `gdal.Dataset`. Parameters ---------- terrestrial : bool True to filter to only terrestrial TransCom regions (Default: True) Returns ------- gdal.Dataset ''' # TransCom (source) spatial reference system wkt0 = osr.SpatialReference() wkt0.ImportFromEPSG(4326) gt0 = (-180, 1, 0, 90, 0, -1) return array_to_raster( self.__filter__(self.data) if terrestrial else self.data, gt0, str(wkt0))
Returns the TransCom data as a
gdal.Dataset
.Parameters
terrestrial
:bool
- True to filter to only terrestrial TransCom regions (Default: True)
Returns
gdal.Dataset
def count_ease2_by_transcom(self, array, grid='M01', scale=1, text_labels=False, nodata=-9999)
-
Expand source code
def count_ease2_by_transcom( self, array, grid = 'M01', scale = 1, text_labels = False, nodata = -9999): ''' Calculates the number of non-NaN/ non-missing values in each TransCom region within the provided array. Calls `TransCom.summarize_ease2_by_transcom()` as a subroutine. Parameters ---------- array : numpy.ndarray Data array with the same SRS, grid size as desired for the TransCom data grid : str The EASE-Grid 2.0 name for which the TransCom data should be aligned and resampled on (e.g., "M01" or "M09") scale : int or float Optional scaling parameter to apply to the input array values, e.g., if the array values are (spatial) rates and should be scaled by the (equal) area of the grid cell text_labels : bool True to use the names of the TransCom regions instead of their region codes, in the output nodata : int or float NoData or Fill value in the array data to ignore Returns ------- dict ''' count = lambda arr: np.sum(np.where(np.isnan(arr), 0, 1)) return self.summarize_ease2_by_transcom( array, count, grid, text_labels, nodata)
Calculates the number of non-NaN/ non-missing values in each TransCom region within the provided array. Calls
TransCom.summarize_ease2_by_transcom()
as a subroutine.Parameters
array
:numpy.ndarray
- Data array with the same SRS, grid size as desired for the TransCom data
grid
:str
- The EASE-Grid 2.0 name for which the TransCom data should be aligned and resampled on (e.g., "M01" or "M09")
scale
:int
orfloat
- Optional scaling parameter to apply to the input array values, e.g., if the array values are (spatial) rates and should be scaled by the (equal) area of the grid cell
text_labels
:bool
- True to use the names of the TransCom regions instead of their region codes, in the output
nodata
:int
orfloat
- NoData or Fill value in the array data to ignore
Returns
dict
def rescaled(self, size_degrees=1)
-
Expand source code
def rescaled(self, size_degrees = 1): ''' Rescales the TransCom grid (nominally 1-degree by 1-degree) to a scalar multiple equirectangular grid size; e.g., size_degrees = 0.5 would enlarge the array from (180, 360) to (360, 720). Parameters ---------- size_degrees : int or float The output (equirectangular) grid resolution in degrees Returns ------- numpy.ndarray ''' opts = { # Options to zoom() 'order': 0, 'zoom': (1/size_degrees), 'mode': 'grid-constant', 'grid_mode': True } return zoom(self.data, **opts)
Rescales the TransCom grid (nominally 1-degree by 1-degree) to a scalar multiple equirectangular grid size; e.g., size_degrees = 0.5 would enlarge the array from (180, 360) to (360, 720).
Parameters
size_degrees
:int
orfloat
- The output (equirectangular) grid resolution in degrees
Returns
numpy.ndarray
def summarize_ease2_by_transcom(self,
array,
summaries={'mean': <function nanmean>, 'std': <function nanstd>},
grid='M01',
scale=1,
text_labels=False,
nodata=-9999)-
Expand source code
def summarize_ease2_by_transcom( self, array, summaries = dict(mean = np.nanmean, std = np.nanstd), grid = 'M01', scale = 1, text_labels = False, nodata = -9999): ''' Calculates the mean and standard deviation of the input array values within each TransCom class. NOTE: The statistical summaries are accumulated as 32-bit floating point values, which is fastest, but will not be accurate for certain data types. If the input data array are large integers (e.g., soil organic carbon/ SOC values), then this should not be a problem. Parameters ---------- array : numpy.ndarray Data array with the same SRS, grid size as desired for the TransCom data summaries : dict Dictionary of {label: function} for every summary statistic desired; function should be NumPy summary function, e.g., nanmean, nansum, ... grid : str The EASE-Grid 2.0 name for which the TransCom data should be aligned and resampled on (e.g., "M01" or "M09") scale : int or float Optional scaling parameter to apply to the input array values, e.g., if the array values are (spatial) rates and should be scaled by the (equal) area of the grid cell text_labels : bool True to use the names of the TransCom regions instead of their region codes, in the output nodata : int or float NoData or Fill value in the array data to ignore Returns ------- dict Dictionary with TransCom regions as labels and a nested Dictionary with a key-value pair for each desired summary statistic. ''' assert array.ndim == 2 or (array.ndim == 3 and array.shape[0] == 1), 'Can only work with 1-band raster arrays' if array.ndim == 3: array = array[0,...] # Unwrap 1-band raster arrays # Extract grid size in km; get the resampled TransCom array g = int(re.compile(r'.*(?P<km>\d{2})').match(grid).groups()[0]) transcom = getattr(self, '__transcom_%dkm__' % g) assert transcom.shape == array.shape, 'Input array does not match the TransCom regions grid at the specified grid size' # Fill in NaN where there is NoData if nodata is not None: array = np.where(array == -9999, np.nan, array) # Determine how we will organize statistics by class label if text_labels: # Create, e.g., {'Australia': {}, ...} stats = dict([(v, dict()) for v in self.onshore_regions.values()]) else: # Create, e.g., {1: {}, 2: {}, ...} stats = dict([(k, dict()) for k in self.onshore_regions.keys()]) for code, label in self.onshore_regions.items(): i = label if text_labels else code query = np.multiply( # Scale cell values (Default = 1.0) np.where(np.isin(transcom, code), array, np.nan), scale) for stat_name, func in summaries.items(): # NOTE: Runs faster if dtype of accumulator is *not* set stats[i][stat_name] = func(query) return stats
Calculates the mean and standard deviation of the input array values within each TransCom class.
NOTE: The statistical summaries are accumulated as 32-bit floating point values, which is fastest, but will not be accurate for certain data types. If the input data array are large integers (e.g., soil organic carbon/ SOC values), then this should not be a problem.
Parameters
array
:numpy.ndarray
- Data array with the same SRS, grid size as desired for the TransCom data
summaries
:dict
- Dictionary of {label: function} for every summary statistic desired; function should be NumPy summary function, e.g., nanmean, nansum, …
grid
:str
- The EASE-Grid 2.0 name for which the TransCom data should be aligned and resampled on (e.g., "M01" or "M09")
scale
:int
orfloat
- Optional scaling parameter to apply to the input array values, e.g., if the array values are (spatial) rates and should be scaled by the (equal) area of the grid cell
text_labels
:bool
- True to use the names of the TransCom regions instead of their region codes, in the output
nodata
:int
orfloat
- NoData or Fill value in the array data to ignore
Returns
dict
- Dictionary with TransCom regions as labels and a nested Dictionary with a key-value pair for each desired summary statistic.
def transcom_on_ease2_grid(self, grid='M01', terrestrial=True)
-
Expand source code
def transcom_on_ease2_grid(self, grid = 'M01', terrestrial = True): ''' Projects the equirectangular TransCom regions data onto the EASE-Grid 2.0 spatial reference system. NOTE: EASE-Grid 2.0 only extends to 84 degrees latitude North or South, so the TransCom regions ought to be restricted to this same extent; but it doesn't actually matter and the projection is still accurate. I'm making a note here in case it becomes important later: self.__y_coords__ = f.variables['lat'].data.copy() idx = np.where(np.abs(self.__y_coords__) < 84)[0] transcom = self.data[idx,:] Parameters ---------- grid : str The EASE-Grid 2.0 name for which the TransCom data should be aligned and resampled on (e.g., "M01" or "M09") terrestrial : bool True to filter to only terrestrial TransCom regions Returns ------- gdal.Dataset ''' # EASE-Grid 2.0 (target) spatial reference system wkt = osr.SpatialReference() wkt.ImportFromWkt(EPSG[EASE2_GRID_PARAMS[grid]['epsg']]) # Create a gdal.Dataset from TransCom data rast0 = self.as_raster(terrestrial) gt0 = rast0.GetGeoTransform() wkt0 = str(rast0.GetProjection()) py, px = self.data.shape px += 12 # HACK: Output is clipped for some reason (=/) # The output (projected) raster's GeoTransform is difficult to # determine, but this should do it automatically gt = gdal.AutoCreateWarpedVRT( rast0, str(wkt0), str(wkt), gdal.GRA_NearestNeighbour).GetGeoTransform() # rast0 is input raster, rast is output raster rast = gdal.GetDriverByName('MEM').Create('', px, py, 1, gdalconst.GDT_Int16) rast.SetGeoTransform(gt) rast.SetProjection(str(wkt)) gdal.ReprojectImage( rast0, rast, str(wkt0), str(wkt), gdalconst.GRA_NearestNeighbour) return rast
Projects the equirectangular TransCom regions data onto the EASE-Grid 2.0 spatial reference system.
NOTE: EASE-Grid 2.0 only extends to 84 degrees latitude North or South, so the TransCom regions ought to be restricted to this same extent; but it doesn't actually matter and the projection is still accurate. I'm making a note here in case it becomes important later:
self.__y_coords__ = f.variables['lat'].data.copy() idx = np.where(np.abs(self.__y_coords__) < 84)[0] transcom = self.data[idx,:]
Parameters
grid
:str
- The EASE-Grid 2.0 name for which the TransCom data should be aligned and resampled on (e.g., "M01" or "M09")
terrestrial
:bool
- True to filter to only terrestrial TransCom regions
Returns
gdal.Dataset
class TransComCLI (output_path=None,
field='SOC/soc_mean',
summaries=('nanmean', 'nanstd'),
grid='M09',
**kwargs)-
Expand source code
class TransComCLI(CommandLineInterface): ''' A command-line interface (CLI) for convenience; used with Google fire. ''' def __init__( self, output_path = None, field = 'SOC/soc_mean', summaries = ('nanmean', 'nanstd'), grid = 'M09', **kwargs): self._output_path = output_path self._field = field self._grid = grid self._summaries = summaries self._kwargs = kwargs self._kwargs['grid'] = grid self.tc = TransCom() def __check__(self): assert self._output_path is not None,\ 'You must specify an output_path with: --output-path=""' assert os.path.exists(os.path.dirname(self._output_path)),\ 'Did not recognize output_path (Cannot use shortcuts like ~)' def __dump_csv__(self, output_path, items): with open(output_path, 'w') as stream: writer = csv.writer(stream, delimiter = ',', quotechar = '"') writer.writerow(('filename', 'transcom_region', 'statistic', 'value')) for each in items: writer.writerow(each) def __expand_summary__(self, items): 'Expands ["A", {"a": 1, "b": 2}] into [("A", "a", 1), ("A", "b", 2)]' new_items = [] # TODO Would be nice to generalize this using a recursive pattern for filename, d in items: for label, summary in d.items(): for key, value in summary.items(): new_items.append((filename, label, key, value)) return new_items def report_areas(self): ''' Prints the reported area of each TransCom region. ''' # NOTE: The underlying data are 32-bit floating point, so this # string formatting will maximize the precision for k, v, in self.tc.__reported_area_by_region__.items(): print('%s: %.0f' % (k, v)) def summarize(self, *file_paths): ''' Creates a statistical summary by class label (TransCom region) for each of multiple input HDF5 files. Use: $ python transcom.py summarize ./files/*.h5 --field="SOC/soc_mean" --output-path="./summaries.csv" $ python transcom.py summarize ./files/*.h5 --field="SOC/soc_mean" --output-path="./summaries.csv" --summaries="('nanmean',)" Parameters ---------- *file_paths : str field : str Hierarchical path name of field to summarize: `--field="<field>"` summaries : str For compatibility at the CLI, can describe the statistical summary functions desired as a comma-delimited string, e.g., `"('nanmean','nanstd')"` where names refer to NumPy functions output_path : str File path where the output CSV file should be written **kwargs : str Other keyword arguments to be passed on to `TransCom.summarize_ease2_by_transcom()` ''' self.__check__() results = [] n = len(file_paths) assert n > 0, 'No file paths given (Did you forget to specify "field" argument?)' # NOTE: fire.Fire() implicitly transforms comma-delimited string into # tuple; here, e.g., "('nanmean', 'nanstd')" becomes: # {'nanmean': np.nanmean, 'nanstd': np.nanstd} self._kwargs['summaries'] = dict([ (name, getattr(np, name)) for name in self._summaries ]) # Determine what kind of files we're working with mode = 'other' if file_paths[0].split('.')[-1] == 'h5': mode = 'hdf5' elif file_paths[0].split('.')[-1] in TYPE_MAP.keys(): mode = 'sparse' with ProgressBar(len(file_paths), 'Summarizing files...') as progress: for i, filename in enumerate(file_paths): if mode == 'hdf5': with h5py.File(filename, 'r') as hdf: arr = hdf[self._field][:] elif mode == 'sparse': tcf = TCFArray(filename, self._grid) tcf.inflate() arr = tcf.data elif mode == 'other': arr, _, _ = as_array(filename, band_axis = False) stats = self.tc.summarize_ease2_by_transcom(arr, **self._kwargs) results.append((filename, stats)) progress.update(i + 1) self.__dump_csv__(self._output_path, self.__expand_summary__(results))
A command-line interface (CLI) for convenience; used with Google fire.
Ancestors
Methods
def report_areas(self)
-
Expand source code
def report_areas(self): ''' Prints the reported area of each TransCom region. ''' # NOTE: The underlying data are 32-bit floating point, so this # string formatting will maximize the precision for k, v, in self.tc.__reported_area_by_region__.items(): print('%s: %.0f' % (k, v))
Prints the reported area of each TransCom region.
def summarize(self, *file_paths)
-
Expand source code
def summarize(self, *file_paths): ''' Creates a statistical summary by class label (TransCom region) for each of multiple input HDF5 files. Use: $ python transcom.py summarize ./files/*.h5 --field="SOC/soc_mean" --output-path="./summaries.csv" $ python transcom.py summarize ./files/*.h5 --field="SOC/soc_mean" --output-path="./summaries.csv" --summaries="('nanmean',)" Parameters ---------- *file_paths : str field : str Hierarchical path name of field to summarize: `--field="<field>"` summaries : str For compatibility at the CLI, can describe the statistical summary functions desired as a comma-delimited string, e.g., `"('nanmean','nanstd')"` where names refer to NumPy functions output_path : str File path where the output CSV file should be written **kwargs : str Other keyword arguments to be passed on to `TransCom.summarize_ease2_by_transcom()` ''' self.__check__() results = [] n = len(file_paths) assert n > 0, 'No file paths given (Did you forget to specify "field" argument?)' # NOTE: fire.Fire() implicitly transforms comma-delimited string into # tuple; here, e.g., "('nanmean', 'nanstd')" becomes: # {'nanmean': np.nanmean, 'nanstd': np.nanstd} self._kwargs['summaries'] = dict([ (name, getattr(np, name)) for name in self._summaries ]) # Determine what kind of files we're working with mode = 'other' if file_paths[0].split('.')[-1] == 'h5': mode = 'hdf5' elif file_paths[0].split('.')[-1] in TYPE_MAP.keys(): mode = 'sparse' with ProgressBar(len(file_paths), 'Summarizing files...') as progress: for i, filename in enumerate(file_paths): if mode == 'hdf5': with h5py.File(filename, 'r') as hdf: arr = hdf[self._field][:] elif mode == 'sparse': tcf = TCFArray(filename, self._grid) tcf.inflate() arr = tcf.data elif mode == 'other': arr, _, _ = as_array(filename, band_axis = False) stats = self.tc.summarize_ease2_by_transcom(arr, **self._kwargs) results.append((filename, stats)) progress.update(i + 1) self.__dump_csv__(self._output_path, self.__expand_summary__(results))
Creates a statistical summary by class label (TransCom region) for each of multiple input HDF5 files. Use:
$ python transcom.py summarize ./files/*.h5 --field="SOC/soc_mean" --output-path="./summaries.csv" $ python transcom.py summarize ./files/*.h5 --field="SOC/soc_mean" --output-path="./summaries.csv" --summaries="('nanmean',)"
Parameters
*file_paths
:str
field
:str
- Hierarchical path name of field to summarize:
--field="<field>"
summaries
:str
- For compatibility at the CLI, can describe the statistical summary
functions desired as a comma-delimited string, e.g.,
"('nanmean','nanstd')"
where names refer to NumPy functions output_path
:str
- File path where the output CSV file should be written
**kwargs
:str
- Other keyword arguments to be passed on to
TransCom.summarize_ease2_by_transcom()
Inherited members