most recent 30 from stackoverflow.com 2026-05-01T13:04:35Z https://stackoverflow.com/feeds/tag?tagnames=python-xarray https://creativecommons.org/licenses/by-sa/4.0/rdf https://stackoverflow.com/q/79934251 1 Ramiro chalar https://stackoverflow.com/users/17151636 2026-04-30T13:32:51Z 2026-04-30T13:32:51Z <p>I'm trying to load some .nc files this way:</p> <pre><code>ds = xr.open_dataset('path/to/file.nc') </code></pre> <p>At first I get no error message, but when I try to operate or simply visualize the data I get this really long error message:</p> <pre><code>ds --------------------------------------------------------------------------- InvalidVersion Traceback (most recent call last) File ~\anaconda3\Lib\site-packages\IPython\core\formatters.py:406, in BaseFormatter.__call__(self, obj) 404 method = get_real_method(obj, self.print_method) 405 if method is not None: --&gt; 406 return method() 407 return None 408 else: File ~\anaconda3\Lib\site-packages\xarray\core\dataset.py:2436, in Dataset._repr_html_(self) 2434 if OPTIONS[&quot;display_style&quot;] == &quot;text&quot;: 2435 return f&quot;&lt;pre&gt;{escape(repr(self))}&lt;/pre&gt;&quot; -&gt; 2436 return formatting_html.dataset_repr(self) File ~\anaconda3\Lib\site-packages\xarray\core\formatting_html.py:377, in dataset_repr(ds) 374 sections.append(dim_section(ds)) 376 if ds.coords: --&gt; 377 sections.append(coord_section(ds.coords)) 379 sections.append(datavar_section(ds.data_vars)) 381 display_default_indexes = _get_boolean_with_default( 382 &quot;display_default_indexes&quot;, False 383 ) File ~\anaconda3\Lib\site-packages\xarray\core\formatting_html.py:225, in _mapping_section(mapping, name, details_func, max_items_collapse, expand_option_name, enabled, **kwargs) 218 expanded = max_items_collapse is None or _get_boolean_with_default( 219 expand_option_name, n_items &lt; max_items_collapse 220 ) 221 collapsed = not expanded 223 return collapsible_section( 224 f&quot;{name}:&quot;, --&gt; 225 
details=details_func(mapping, **kwargs), 226 n_items=n_items, 227 enabled=enabled, 228 collapsed=collapsed, 229 ) File ~\anaconda3\Lib\site-packages\xarray\core\formatting_html.py:127, in summarize_coords(variables) 123 dim_ordered_coords = sorted( 124 variables.items(), key=partial(_coord_sort_key, dims=dims) 125 ) 126 for k, v in dim_ordered_coords: --&gt; 127 li_content = summarize_variable(k, v, is_index=k in variables.xindexes) 128 li_items.append(f&quot;&lt;li class='xr-var-item'&gt;{li_content}&lt;/li&gt;&quot;) 130 vars_li = &quot;&quot;.join(li_items) File ~\anaconda3\Lib\site-packages\xarray\core\formatting_html.py:96, in summarize_variable(name, var, is_index, dtype) 93 data_id = &quot;data-&quot; + str(uuid.uuid4()) 94 disabled = &quot;&quot; if len(var.attrs) else &quot;disabled&quot; ---&gt; 96 preview = escape(inline_variable_array_repr(variable, 35)) 97 attrs_ul = summarize_attrs(var.attrs) 98 data_repr = short_data_repr_html(variable) File ~\anaconda3\Lib\site-packages\xarray\core\formatting.py:316, in inline_variable_array_repr(var, max_width) 314 if getattr(var, &quot;_in_memory&quot;, False): 315 return format_array_flat(var, max_width) --&gt; 316 dask_array_type = array_type(&quot;dask&quot;) 317 if isinstance(var._data, dask_array_type): 318 return inline_dask_repr(var.data) File ~\anaconda3\Lib\site-packages\xarray\namedarray\pycompat.py:83, in array_type(mod) 81 def array_type(mod: ModType) -&gt; DuckArrayTypes: 82 &quot;&quot;&quot;Quick wrapper to get the array class of the module.&quot;&quot;&quot; ---&gt; 83 return _get_cached_duck_array_module(mod).type File ~\anaconda3\Lib\site-packages\xarray\namedarray\pycompat.py:74, in _get_cached_duck_array_module(mod) 72 def _get_cached_duck_array_module(mod: ModType) -&gt; DuckArrayModule: 73 if mod not in _cached_duck_array_modules: ---&gt; 74 duckmod = DuckArrayModule(mod) 75 _cached_duck_array_modules[mod] = duckmod 76 return duckmod File 
~\anaconda3\Lib\site-packages\xarray\namedarray\pycompat.py:40, in DuckArrayModule.__init__(self, mod) 38 try: 39 duck_array_module = import_module(mod) ---&gt; 40 duck_array_version = Version(duck_array_module.__version__) 42 if mod == &quot;dask&quot;: 43 duck_array_type = (import_module(&quot;dask.array&quot;).Array,) File ~\anaconda3\Lib\site-packages\packaging\version.py:202, in Version.__init__(self, version) 200 match = self._regex.search(version) 201 if not match: --&gt; 202 raise InvalidVersion(f&quot;Invalid version: {version!r}&quot;) 204 # Store the parsed out pieces of the version 205 self._version = _Version( 206 epoch=int(match.group(&quot;epoch&quot;)) if match.group(&quot;epoch&quot;) else 0, 207 release=tuple(int(i) for i in match.group(&quot;release&quot;).split(&quot;.&quot;)), (...) 213 local=_parse_local_version(match.group(&quot;local&quot;)), 214 ) InvalidVersion: Invalid version: 'unknown' </code></pre> <p>(this isn't actually the full message, but it wouldn't let me submit with such a long code section)</p> <p>I tried looking for answers but couldn't find anyone with the same error message.</p> https://stackoverflow.com/q/79931486 2 Ramiro chalar https://stackoverflow.com/users/17151636 2026-04-24T18:00:51Z 2026-04-24T21:51:27Z <p>I have an xarray <code>DataArray</code> that contains daily data for every day from 1970 to 2023 for the months October through March, and I want to calculate the average value of the variable for each day (e.g. the average value for all January 1sts).</p> <p>My data is like this:</p> <pre><code>&lt;xarray.DataArray 'u' (valid_time: 9990, latitude: 321, longitude: 361)&gt; Size: 5GB dask.array&lt;getitem, shape=(9990, 321, 361), dtype=float32, chunksize=(260, 161, 181), chunktype=numpy.ndarray&gt; Coordinates: number int64 8B 0 pressure_level float64 8B 500.0 * latitude (latitude) float64 3kB 0.0 -0.25 -0.5 ... -79.5 -79.75 -80.0 * longitude (longitude) float64 3kB -100.0 -99.75 -99.5 ... 
-10.25 -10.0 * valid_time (valid_time) datetime64[ns] 80kB 1970-01-01 ... 2023-12-31 </code></pre> <p>I've tried with <code>groupby</code> but only managed to group the data either by month or by day (i.e. all 1sts of the month regardless of month), not both. I can regroup each of these groups, but I bet there's a more elegant solution. Regrouping the groups looks like this:</p> <pre class="lang-py prettyprint-override"><code>da.groupby('valid_time.month')[1].groupby('valid_time.day')[1] </code></pre> <p>That would result in all the January 1sts, and I could technically do it for all the days.</p> <p>I also tried something like in <a href="https://stackoverflow.com/questions/43145920/calculate-maximum-value-by-day-of-the-year-over-certain-period">this post</a>:</p> <pre class="lang-py prettyprint-override"><code>da.groupby(da.index.strftime('%m-%d')).mean() </code></pre> <p>but I get the message:</p> <pre><code>AttributeError: 'DataArray' object has no attribute 'index' </code></pre> <p>The desired result would be something of shape (D, latitude, longitude), where D is the number of days between October 1st and March 31st (without all the February 29ths, which were removed).</p> https://stackoverflow.com/q/79821864 3 helpmeplease https://stackoverflow.com/users/31889448 2025-11-17T02:25:17Z 2026-03-17T21:46:12Z <p>I am trying to analyze the 30 day standardized precipitation index for a multi-state range of the southeastern US for the year 2016. I'm using xclim to process a direct pull of gridded daily precipitation data from nClimGrid-Daily, which involves pulling about 40 years of data and doing a 30-day rolling mean of the daily sum precip. Resolution is daily and 48.3 km. What I expected was the code to provide me the 30-day SPI for the entire range which I could then subset for the year 2016. Instead, I'll run this on my school's HPC via ssh and then get kicked off the compute node halfway (compute node has about 70gb of RAM). 
I can run it on my local machine, but gets too hot to complete.</p> <p>I've downloaded the range I needed (8.8gb). After geographically and temporally subsetting, it's around 7gb. I can lazy load everything, but when I persist/compute with dask it just keeps stalling after running for a certain amount of time - a lot of processing power is going towards rechunking by xclim even though I've tried rechunking prior to avoid it. As a note, when I use a coarser spatial resolution (county-level vs gridded) I can run this in less than an hour. With gridded, it breaks after about 3 hours. Please let me know if you have any suggestions!</p> <pre><code>import xarray as xr from dask.distributed import Client, LocalCluster import os from pathlib import Path from concurrent.futures import ThreadPoolExecutor, as_completed import requests #### bulk download files base_url = &quot;https://www.ncei.noaa.gov/data/nclimgrid-daily/access/grids&quot; years = range(1976, 2017) months = range(1, 13) out_dir = Path(&quot;./nclimgrid_prcp&quot;) # adjust as needed out_dir.mkdir(parents=True, exist_ok=True) max_workers = 16 timeout = 60 # seconds def dl(year, month): fname = f&quot;ncdd-{year}{month:02}-grd-scaled.nc&quot; url = f&quot;{base_url}/{year}/{fname}&quot; path = out_dir / fname if path.exists(): return path, &quot;exists&quot; try: r = requests.get(url, timeout=timeout, stream=True) r.raise_for_status() with open(path, &quot;wb&quot;) as f: for chunk in r.iter_content(chunk_size=2**20): if chunk: f.write(chunk) return path, &quot;downloaded&quot; except Exception as e: return path, f&quot;error: {e}&quot; futures = [] with ThreadPoolExecutor(max_workers=max_workers) as ex: for y in years: for m in months: futures.append(ex.submit(dl, y, m)) files = [] for fut in as_completed(futures): p, status = fut.result() print(p.name, status) if status in (&quot;exists&quot;, &quot;downloaded&quot;): files.append(str(p)) ####### End download files files = 
sorted(glob.glob('nclimgrid_prcp/*.nc')) #access local files def subset(ds): return ds.sel( lat=slice(30, 38), lon=slice(-92, -78), ) ds = xr.open_mfdataset(files, combine=&quot;by_coords&quot; preprocess=subset,engine='netcdf4') pr = ds['prcp'] pr = pr.assign_attrs(units=&quot;mm/day&quot;) pr = pr.chunk( { &quot;time&quot;: -1, &quot;lat&quot;: 64, &quot;lon&quot;: 84, } ) cluster = LocalCluster() client = Client(cluster) client.cluster.scale(10) spi_30 = xc.indices.standardized_precipitation_index( pr, freq=&quot;D&quot;, window=30) </code></pre> https://stackoverflow.com/q/79595427 3 Ryan Connelly https://stackoverflow.com/users/8540864 2025-04-27T19:43:41Z 2026-03-14T21:09:14Z <p>An example HRRR file I'm trying to open with xarray: <a href="https://storage.googleapis.com/high-resolution-rapid-refresh/hrrr.20250427/conus/hrrr.t18z.wrfprsf06.grib2" rel="nofollow noreferrer">https://storage.googleapis.com/high-resolution-rapid-refresh/hrrr.20250427/conus/hrrr.t18z.wrfprsf06.grib2</a></p> <p>When trying to open the dataset with xarray:</p> <pre><code>data = xr.open_dataset(HRRR_data_path+model_file, engine=&quot;cfgrib&quot;, backend_kwargs= {'filter_by_keys': {'typeOfLevel': 'isobaricInhPa'}}) </code></pre> <p>I encounter this error:</p> <pre><code>Traceback (most recent call last): File &quot;C:\Users\rconn\Documents\Python_Scripts\HRRR_four_level_stdev_omega_both_forecast_new.py&quot;, line 387, in &lt;module&gt; main(sys.argv[1:]) File &quot;C:\Users\rconn\Documents\Python_Scripts\HRRR_four_level_stdev_omega_both_forecast_new.py&quot;, line 216, in main data = xr.open_dataset(HRRR_data_path+model_file, engine=&quot;cfgrib&quot;, backend_kwargs= {'filter_by_keys': {'typeOfLevel': 'isobaricInhPa'}}) File &quot;C:\Users\rconn\anaconda3\Lib\site-packages\xarray\backends\api.py&quot;, line 552, in open_dataset backend = plugins.get_backend(engine) ^^^^^^^^^^^^^^^^^^^^^^^^^^^ File &quot;C:\Users\rconn\anaconda3\Lib\site-packages\xarray\backends\plugins.py&quot;, 
line 205, in get_backend raise ValueError( ValueError: unrecognized engine cfgrib must be one of: ['netcdf4', 'h5netcdf', 'scipy', 'gini', 'store', 'zarr'] </code></pre> <p>I already installed eccodes (through pip) and cfgrib (through conda). If I run</p> <pre><code>python -m cfgrib selfcheck </code></pre> <p>that returns</p> <pre><code>Found: ecCodes v2.24.2. Your system is ready. </code></pre> <p>yet the error about cfgrib being an unrecognized engine persists.</p> <p>I'm using xarray 2023.6.0 in python 3.12.7 on a Win64 PC.</p> https://stackoverflow.com/q/79862585 0 Sebastian Engelstaedter https://stackoverflow.com/users/32157927 2026-01-07T16:08:12Z 2026-02-26T07:02:29Z <p>I am trying to read multiple ERA5 GRIB files using the Python Xarray's <code>xr.mfopen_dataset()</code> method. My ERA5 input files are on a server in a location where I only have read permission for. Therefore, I would like to save the index files generated by the cfgrib engine in a local custom directory. This works fine for a single GRIB file using the following code below where I can specify the index filename explicitly.</p> <pre><code>import xarray as xr from pathlib import Path # define input file ifile = Path('/soge-home/data/analysis/era5/0.28125x0.28125/hourly/total_precipitation/grb/era5_hourly_total_precipitation_202505.grb') # create index directory workdir = Path.cwd() idx_dir = Path(f'{workdir}/idx_files') idx_dir.mkdir(exist_ok=True, parents=True) # read dataset ds = xr.open_dataset(ifile, chunks='auto', engine='cfgrib', backend_kwargs={ &quot;indexpath&quot;: str(idx_dir / ifile.name) + &quot;.{short_hash}.idx&quot;, 'time_dims': ('valid_time',) # use &quot;valid_time&quot; as main time dim }, ).rename({'valid_time': 'time'}) # rename &quot;valid_time&quot; to &quot;time&quot; </code></pre> <p>Now I try to employ the same method reading in multiple GRIB files using the code below.</p> <pre><code>import xarray as xr from pathlib import Path # create index directory workdir = 
Path.cwd() idx_dir = Path(f'{workdir}/idx_files') idx_dir.mkdir(exist_ok=True, parents=True) # create sorted list of input files var_dir = Path('/soge-home/data/analysis/era5/0.28125x0.28125/hourly/total_precipitation/grb/') ifiles = sorted(var_dir.glob(f'*_2025*.grb')) # read data with custom index files ds = xr.open_mfdataset(ifiles, chunks='auto', engine='cfgrib', combine='nested', concat_dim='valid_time', parallel=True, backend_kwargs={ 'indexpath': str(idx_dir) + &quot;/test.idx&quot;, # custom index file location 'time_dims': ('valid_time',) # use valid_time as main time dim }, ).rename({'valid_time': 'time'}) # rename valid_time to time </code></pre> <p>The files are read in fine by xr.open_mfdataset(), but no index files are being created. I am getting the following error messages:</p> <pre><code>Can't read index file '/hn01-home/worc1870/research/python/progs/grb_test/idx_files/test.idx' Traceback (most recent call last): File &quot;/hn01-home/worc1870/micromamba/envs/mm313/lib/python3.13/site-packages/cfgrib/messages.py&quot;, line 551, in from_indexpath_or_filestream self = cls.from_indexpath(indexpath) File &quot;/hn01-home/worc1870/micromamba/envs/mm313/lib/python3.13/site-packages/cfgrib/messages.py&quot;, line 430, in from_indexpath index = pickle.load(file) EOFError: Ran out of input Ignoring index file '/hn01-home/worc1870/research/python/progs/grb_test/idx_files/test.idx' incompatible with GRIB file Ignoring index file '/hn01-home/worc1870/research/python/progs/grb_test/idx_files/test.idx' incompatible with GRIB file Ignoring index file '/hn01-home/worc1870/research/python/progs/grb_test/idx_files/test.idx' incompatible with GRIB file </code></pre> <p>I tried all kind of things but did not get it to work. Am I doing something wrong or is this just not supported at the moment? Any suggestions are much appreciated. 
The index files are especially useful when reading in larger data volumes.</p> <p>Using Python 3.13.7, Xarray 2025.10.1 and cfgrib 0.9.15.1.</p> <p>Many thanks,<br /> Sebastian</p> https://stackoverflow.com/q/79873181 0 arctic_climate_science https://stackoverflow.com/users/1580516 2026-01-21T20:26:29Z 2026-01-22T11:54:05Z <p>I have 2 elevation datasets -- one is a low resolution dataset (~0.1 degree resolution), and another is a high resolution dataset (~0.001 degree resolution). Let's assume that the datasets look something like this (in reality they are much larger, so I will need to find a way to do this in a memory efficient manner, such as through chunking):</p> <pre><code>import numpy as np import pandas as pd import geopandas as gpd import xarray as xr lr_elev = np.random.uniform(low=0, high=4000, size=(11, 11)) hr_elev = np.random.uniform(low=0, high=4000, size=(1001, 1001)) lr_lats = np.linspace(44.0, 45.0, 11) lr_lons = np.linspace(-115.0, -114.0, 11) hr_lats = np.linspace(44.0, 45.0, 1001) hr_lons = np.linspace(-115.0, -114.0, 1001) lr_ds = xr.Dataset( data_vars=dict(elevation=([&quot;lat&quot;, &quot;lon&quot;], lr_elev)), coords=dict(lon=(&quot;lon&quot;, lr_lons), lat=(&quot;lat&quot;, lr_lats)), ) hr_ds = xr.Dataset( data_vars=dict(elevation=([&quot;lat&quot;, &quot;lon&quot;], hr_elev)), coords=dict(lon=(&quot;lon&quot;, hr_lons), lat=(&quot;lat&quot;, hr_lats)), ) print(lr_ds) print(hr_ds) </code></pre> <p>I know how to find the nearest elevation value from the low resolution dataset for each of the high resolution lat/lon pairs:</p> <pre><code># ### Treat midpoint of HR grid cells as a point -- then find the nearest LR grid cell as if they are stations nearest_lr_point = lr_ds.sel(lat=hr_ds.lat,lon=hr_ds.lon, method='nearest') print(nearest_lr_point) </code></pre> <p>However, I haven't been able to figure out how I would instead return the <strong>index</strong> of the value instead of the actual value. 
The reason for this is that I would like to be able to match each high-res lat/lon pair with its corresponding low-resolution grid cell (for later calculations). Is there a way to return the index of the value instead for each lat/lon pair?</p> <p>I am unable to convert this into a dataframe easily (since the real high resolution dataset is about 700GB in size)</p> https://stackoverflow.com/q/79862953 0 imantha https://stackoverflow.com/users/10639382 2026-01-08T04:54:12Z 2026-01-08T07:50:54Z <p>I am wondering how to add the crs to xarray dataset. For example if we have a min and max dataset and compute the mean. How can we set the mean dataset to have the same CRS as the min or max datasets ?</p> <pre><code>max_ds = xr.open_dataset(&quot;data/climate_processed/1958_2025_nsw_max_temp.nc&quot;) min_ds = xr.open_dataset(&quot;data/climate_processed/1958_2025_nsw_min_temp.nc&quot;) mean_temp = (max_ds[&quot;max_temp&quot;] + min_ds[&quot;min_temp&quot;]) / 2 mean_ds = mean_temp.to_dataset(name = &quot;temp_mean&quot;) </code></pre> <p>I have copied the outputs max_ds and mean_ds just incase they are useful.<br /> Here is the printed output of max_ds</p> <pre><code>print(max_ds.variables) Frozen({'max_temp': &lt;xarray.Variable (time: 24412, lat: 187, lon: 261)&gt; Size: 10GB [1191476484 values with dtype=float64] Attributes: long_name: Maximum temperature units: Celsius valid_min: -90 valid_max: 540, 'crs': &lt;xarray.Variable (time: 24412)&gt; Size: 24kB [24412 values with dtype=|S1] Attributes: long_name: Coordinate reference system grid_mapping_name: latitude_longitude longitude_of_prime_meridian: 0.0 semi_major_axis: 6378137.0 inverse_flattening: 298.257223563, 'lat': &lt;xarray.IndexVariable 'lat' (lat: 187)&gt; Size: 1kB array([-37.5 , -37.45, -37.4 , -37.35, -37.3 , -37.25, -37.2 , -37.15, -37.1 , -37.05, -37. , -36.95, -36.9 , -36.85, -36.8 , -36.75, -36.7 , -36.65, ... 
-28.95, -28.9 , -28.85, -28.8 , -28.75, -28.7 , -28.65, -28.6 , -28.55, -28.5 , -28.45, -28.4 , -28.35, -28.3 , -28.25, -28.2 ]) Attributes: long_name: latitude standard_name: latitude units: degrees_north axis: Y, 'lon': &lt;xarray.IndexVariable 'lon' (lon: 261)&gt; Size: 2kB array([141. , 141.05, 141.1 , ..., 153.9 , 153.95, 154. ], shape=(261,)) Attributes: long_name: longitude standard_name: longitude units: degrees_east axis: X, 'time': &lt;xarray.IndexVariable 'time' (time: 24412)&gt; Size: 195kB array(['1958-01-01T00:00:00.000000000', '1958-01-02T00:00:00.000000000', '1958-01-03T00:00:00.000000000', ..., '2025-10-31T00:00:00.000000000', '2025-11-01T00:00:00.000000000', '2025-11-02T00:00:00.000000000'], shape=(24412,), dtype='datetime64[ns]') Attributes: axis: T}) </code></pre> <p>printed output of mean_ds</p> <pre><code>print(mean_ds.varaible) Frozen({'lat': &lt;xarray.IndexVariable 'lat' (lat: 187)&gt; Size: 1kB array([-37.5 , -37.45, -37.4 , -37.35, -37.3 , -37.25, -37.2 , -37.15, -37.1 , -37.05, -37. , -36.95, -36.9 , -36.85, -36.8 , -36.75, -36.7 , -36.65, -36.6 , -36.55, -36.5 , -36.45, -36.4 , -36.35, -36.3 , -36.25, -36.2 , -36.15, -36.1 , -36.05, -36. , -35.95, -35.9 , -35.85, -35.8 , -35.75, -35.7 , -35.65, -35.6 , -35.55, -35.5 , -35.45, -35.4 , -35.35, -35.3 , -35.25, -35.2 , -35.15, -35.1 , -35.05, -35. , -34.95, -34.9 , -34.85, -34.8 , -34.75, -34.7 , -34.65, -34.6 , -34.55, -34.5 , -34.45, -34.4 , -34.35, -34.3 , -34.25, -34.2 , -34.15, -34.1 , -34.05, -34. , -33.95, -33.9 , -33.85, -33.8 , -33.75, -33.7 , -33.65, -33.6 , -33.55, -33.5 , -33.45, -33.4 , -33.35, -33.3 , -33.25, -33.2 , -33.15, -33.1 , -33.05, -33. , -32.95, -32.9 , -32.85, -32.8 , -32.75, -32.7 , -32.65, -32.6 , -32.55, -32.5 , -32.45, -32.4 , -32.35, -32.3 , -32.25, -32.2 , -32.15, -32.1 , -32.05, -32. , -31.95, -31.9 , -31.85, -31.8 , -31.75, -31.7 , -31.65, -31.6 , -31.55, -31.5 , -31.45, -31.4 , -31.35, -31.3 , -31.25, -31.2 , -31.15, -31.1 , -31.05, -31. 
, -30.95, -30.9 , -30.85, -30.8 , -30.75, -30.7 , -30.65, -30.6 , -30.55, -30.5 , -30.45, -30.4 , -30.35, -30.3 , -30.25, -30.2 , -30.15, -30.1 , -30.05, -30. , -29.95, -29.9 , -29.85, -29.8 , -29.75, -29.7 , -29.65, -29.6 , -29.55, -29.5 , -29.45, -29.4 , -29.35, -29.3 , -29.25, -29.2 , -29.15, -29.1 , -29.05, -29. , -28.95, -28.9 , -28.85, -28.8 , -28.75, -28.7 , -28.65, -28.6 , -28.55, -28.5 , -28.45, -28.4 , -28.35, -28.3 , -28.25, -28.2 ]) Attributes: long_name: latitude standard_name: latitude units: degrees_north axis: Y, 'lon': &lt;xarray.IndexVariable 'lon' (lon: 261)&gt; Size: 2kB array([141. , 141.05, 141.1 , ..., 153.9 , 153.95, 154. ], shape=(261,)) Attributes: long_name: longitude standard_name: longitude units: degrees_east axis: X, 'time': &lt;xarray.IndexVariable 'time' (time: 24412)&gt; Size: 195kB array(['1958-01-01T00:00:00.000000000', '1958-01-02T00:00:00.000000000', '1958-01-03T00:00:00.000000000', ..., '2025-10-31T00:00:00.000000000', '2025-11-01T00:00:00.000000000', '2025-11-02T00:00:00.000000000'], shape=(24412,), dtype='datetime64[ns]') Attributes: axis: T, 'temp_mean': &lt;xarray.Variable (time: 24412, lat: 187, lon: 261)&gt; Size: 10GB array([[[14.2 , 14.3 , 14.35, ..., nan, nan, nan], [14.3 , 14.4 , 14.45, ..., nan, nan, nan], [14.3 , 14.45, 14.5 , ..., nan, nan, nan], ..., [23.1 , 23.1 , 23.05, ..., nan, nan, nan], [23.2 , 23.15, 23.1 , ..., nan, nan, nan], [23.25, 23.2 , 23.1 , ..., nan, nan, nan]], [[13.25, 13.25, 13.2 , ..., nan, nan, nan], [13.35, 13.3 , 13.25, ..., nan, nan, nan], [13.4 , 13.35, 13.3 , ..., nan, nan, nan], ..., [25.5 , 25.5 , 25.45, ..., nan, nan, nan], [25.55, 25.55, 25.5 , ..., nan, nan, nan], [25.55, 25.55, 25.5 , ..., nan, nan, nan]], [[14.9 , 15. , 15. , ..., nan, nan, nan], [15.05, 15.05, 15.1 , ..., nan, nan, nan], [15.1 , 15.15, 15.15, ..., nan, nan, nan], ..., ... 
[30.3 , 30.35, 30.35, ..., nan, nan, nan], [30.4 , 30.4 , 30.35, ..., nan, nan, nan], [30.45, 30.45, 30.45, ..., nan, nan, nan]], [[15.4 , 15.4 , 15.4 , ..., nan, nan, nan], [15.5 , 15.5 , 15.5 , ..., nan, nan, nan], [15.6 , 15.6 , 15.6 , ..., nan, nan, nan], ..., [27.2 , 27.2 , 27.2 , ..., nan, nan, nan], [27.25, 27.25, 27.2 , ..., nan, nan, nan], [27.3 , 27.25, 27.25, ..., nan, nan, nan]], [[17.55, 17.55, 17.55, ..., nan, nan, nan], [17.5 , 17.55, 17.55, ..., nan, nan, nan], [17.5 , 17.55, 17.55, ..., nan, nan, nan], ..., [30.05, 30. , 29.95, ..., nan, nan, nan], [30.1 , 30.05, 29.95, ..., nan, nan, nan], [30.15, 30.05, 30.05, ..., nan, nan, nan]]], shape=(24412, 187, 261)) Attributes: units: Celsius}) </code></pre> https://stackoverflow.com/q/79821451 0 derM https://stackoverflow.com/users/2056452 2025-11-16T12:19:38Z 2025-11-16T12:19:38Z <p>There has <a href="https://stackoverflow.com/questions/76838097/new-dimension-with-coordinates-in-xarray-apply-ufunc">been at least one other question</a> regarding the introduction of new dimensions in the output of <code>xarray.apply_ufunc</code>; I have two problems with this answer: First, I feel like the answer avoids the core problem, and secondly, there is little to no explanation on the (mis-)conceptions that lead to the problem.</p> <p>Let's consider this (abstract and not working) example:</p> <pre><code>import numpy as np import xarray as xr a1 = np.array([[11,12,13,14],[15,16,17,18],[19,110,111,112],[113,114,115,116]], dtype=np.int16) a2 = np.array([[21,22,23,24],[25,26,27,28],[29,210,211,212],[213,214,215,216]], dtype=np.int16) ds = xr.Dataset( data_vars= { &quot;d1&quot;: ((&quot;y&quot;, &quot;x&quot;), a1), &quot;d2&quot;: ((&quot;y&quot;, &quot;x&quot;), a2) } ).chunk({&quot;y&quot;:2,&quot;x&quot;:2}) def stackuf(xa1, xa2): return np.stack((xa1, xa2), axis=-1) out = xr.apply_ufunc( stackuf, ds.d1, ds.d2, input_core_dims=[[],[]], # what is supposed to be here? 
output_core_dims=[[]], # and what is supposed to be here? output_sizes=dict(ds.sizes, **{&quot;xy&quot;:2}), # and what is needed there? dask=&quot;parallelized&quot;, vectorize=False, output_dtypes=[np.int16], ) out.compute() </code></pre> <p>What it should do: <em>From two variables with two dimensions (y, x), create a new variable with three dimensions (y, x, xy)</em></p> <p>I suspect the arguments <code>input_core_dims, output_core_dims, output_sizes</code> to be the key arguments to make this working. <strong>But I do not really understand their semantics.</strong></p> <p>Please do not:</p> <ul> <li>just make the code working</li> <li>provide workarounds, that circumvent the need to introduce a new dimensions</li> </ul> <p>instead, explain exactly, what needs to be understood about this - and similar - problems.</p> https://stackoverflow.com/q/71689979 1 David_G https://stackoverflow.com/users/1687736 2022-03-31T09:19:48Z 2025-11-10T23:36:13Z <p>I am looking for a way to return the interpolated coordinate value of a variable at a query variable value. I think it is the opposite use to a normal interpolation/table-look up.</p> <p>For example, here is a simple, normal interpolation, From this very good <a href="https://earth-env-data-science.github.io/lectures/xarray/xarray-part2.html" rel="nofollow noreferrer">tutorial</a>:</p> <pre><code>x_data = np.array([0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10]) f = xr.DataArray(x_data**2, dims=['x'], coords={'x': x_data}) f.plot(marker='o') </code></pre> <p>Which produces this plot: <a href="https://i.sstatic.net/WIez6.png" rel="nofollow noreferrer"><img src="https://i.sstatic.net/WIez6.png" alt="simple plot of data" /></a></p> <p>So a standard lookup (e.g. <code>f.interp(x=4.5)</code>) would look for the value of the variable at some coordinate value of <code>x</code>. But I want to look for the value of the <code>x</code> where the variable is equal to some value. 
For example in the above plot, what is the value of <code>x</code> where the variable equals <code>20</code>.</p> <p>I am ultimately trying to do this for a 3D field of temperature (longitude x latitude x depth). So, how do I find the value of depth, where the temperature field is closest to a value that I specify.</p> <p>Many thanks in advance.</p> https://stackoverflow.com/q/79814554 0 Alexis https://stackoverflow.com/users/31842400 2025-11-09T02:18:31Z 2025-11-09T02:18:31Z <p>Very new to python! I am trying to model bottom water temperatures over time and need to reduce the resolution of my model from 1/20º to 1º. My ultimate goal is to map this and select specific grid cells to calculate temperature for each year. I have tried re-grid but came up with errors when trying to import xesmf. Ive looked into using groupby_bins or the coarsen method but haven't been able to figure out how and if they would actually work.</p> <p>This is the error i get when trying to import xesmf:</p> <pre><code>ModuleNotFoundError: No module named 'xarray.core.arithmetic' </code></pre> <p>this is the work that I have done with coarsen so far:</p> <pre><code>coarsen_factor = 20 v20x = v20x.rename({&quot;nav_lon&quot;: &quot;latitude&quot;, &quot;nav_lat&quot;: &quot;longitude&quot;}) coarsened_ds = v20x.coarsen( latitude=coarsen_factor, longitude=coarsen_factor, boundary='exact' ).mean() </code></pre> <p>But I am getting this error:</p> <pre><code>ValueError: Window dimensions ('latitude', 'longitude') not found in Dataset dimensions ('time_counter', 'deptht', 'axis_nbounds', 'y', 'x') </code></pre> <p>I'm not sure if this is the best approach or if groupby_bins would be better and how to execute this. 
Any advice would be so appreciated!</p> https://stackoverflow.com/q/79792858 2 Gary Frewin https://stackoverflow.com/users/11571390 2025-10-17T07:41:49Z 2025-10-30T07:30:48Z <p>I’m trying to subset a large <code>xarray.Dataset</code> backed by <code>Dask</code> and save it back to Zarr, but I’m running into a major memory problem when attempting to drop rows with a boolean mask.</p> <p>Here’s a minimal working example that matches my real-world setup:</p> <pre><code>import numpy as np import xarray as xr import dask.array as da import zarr import zipfile # Simulate a large dataset pos_len = 100_000_000 # rows sample_len = 100 # samples chunks = (100_000, 100) data = da.random.random((pos_len, sample_len), chunks=chunks) xds = xr.Dataset( {&quot;some_var&quot;: ((&quot;pos&quot;, &quot;sample_id&quot;), data)}, coords={&quot;pos&quot;: np.arange(pos_len), &quot;sample_id&quot;: np.arange(sample_len)} ) # Build a boolean mask based on mean coverage coverage_array = &quot;some_var&quot; min_coverage = 0.5 mask_1d = xds[coverage_array].mean(dim=&quot;sample_id&quot;, skipna=True) &gt;= min_coverage # Attempt to drop rows where mask is False cds_masked = xds.where(mask_1d.compute(), other=np.nan, drop=True) # &lt;--- memory explodes here # Without .compute() I get: # ValueError: This will result in a dask array of unknown shape. Such arrays are unsupported by Xarray. Please compute the indexer first using .compute() </code></pre> <ul> <li>If I call <code>.compute()</code> on the mask, memory usage explodes and the process crashes.</li> <li>If I skip <code>.compute()</code>, <code>Xarray</code> errors out because it doesn’t know the shape of the result.</li> <li>If I use <code>drop=False</code>, I avoid the error — but then downstream operations have to handle the nans.</li> </ul> <p>My question: Is there any memory-safe way to drop rows with a boolean mask in <code>Xarray</code>/<code>Dask</code> without fully computing the mask first? 
Or at least, without computing it in a way that explodes into ram and crashes the whole computer.</p> <p>Or is this a fundamental limitation of <code>Xarray</code>/<code>Dask</code> due to unknown chunk shapes after boolean indexing?</p> <p>Are there known workarounds or idioms for this pattern?</p> https://stackoverflow.com/q/52108417 19 Robbi Bishop-Taylor https://stackoverflow.com/users/2510900 2018-08-31T04:26:40Z 2025-10-28T04:19:56Z <p>I have a 1D array of independent variable values (<code>x_array</code>) that match the timesteps in a 3D numpy array of spatial data with multiple time-steps (<code>y_array</code>). My actual data is much larger: 300+ timesteps and up to 3000 * 3000 pixels:</p> <pre><code>import numpy as np from scipy.stats import linregress # Independent variable: four time-steps of 1-dimensional data x_array = np.array([0.5, 0.2, 0.4, 0.4]) # Dependent variable: four time-steps of 3x3 spatial data y_array = np.array([[[-0.2, -0.2, -0.3], [-0.3, -0.2, -0.3], [-0.3, -0.4, -0.4]], [[-0.2, -0.2, -0.4], [-0.3, np.nan, -0.3], [-0.3, -0.3, -0.4]], [[np.nan, np.nan, -0.3], [-0.2, -0.3, -0.7], [-0.3, -0.3, -0.3]], [[-0.1, -0.3, np.nan], [-0.2, -0.3, np.nan], [-0.1, np.nan, np.nan]]]) </code></pre> <p>I want to compute a per-pixel linear regression and obtain R-squared, P-values, intercepts and slopes for each <code>xy</code> pixel in <code>y_array</code>, with values for each timestep in <code>x_array</code> as my independent variable. 
</p> <p>I can reshape to get the data in a form to input it into <code>np.polyfit</code> which is vectorised and fast:</p> <pre><code># Reshape so rows = number of time-steps and columns = pixels: y_array_reshaped = y_array.reshape(len(y_array), -1) # Do a first-degree polyfit np.polyfit(x_array, y_array_reshaped, 1) </code></pre> <p>However, this ignores pixels that contain any <code>NaN</code> values (<code>np.polyfit</code> does not support <code>NaN</code> values), and does not calculate the statistics I require (R-squared, P-values, intercepts and slopes). </p> <p>The <a href="https://stackoverflow.com/questions/13643363/linear-regression-of-arrays-containing-nans-in-python-numpy">answer here</a> uses <code>scipy.stats import linregress</code> which does calculate the statistics I need, and suggests avoiding <code>NaN</code> issues by masking out these <code>NaN</code> values. However, this example is for two 1D arrays, and I can't work out how to apply a similar masking approach to my case where each column in <code>y_array_reshaped</code> will have a different set of <code>NaN</code> values.</p> <p><strong>My question:</strong> How can I calculate regression statistics for each pixel in a large multidimensional array (300 x 3000 x 3000) containing many <code>NaN</code> values in a reasonably fast, vectorised way?</p> <p><strong>Desired result:</strong> A 3 x 3 array of regression statistic values (e.g. 
R-squared) for each pixel in <code>y_array</code>, even if that pixel contains <code>NaN</code> values at some point in the time series</p> https://stackoverflow.com/q/79801475 2 thefrollickingnerd https://stackoverflow.com/users/10339757 2025-10-27T05:27:11Z 2025-10-27T06:20:19Z <p>I have a dataset that I need to convert into a rioxarray dataset so I can use regridding features but I can't work out how to convert an already existing xarray object to rioxarray.</p> <p>Unfortunately I can't just load the object in as an rioxarray object as the file is a csv that I convert from a pandas object to an xarray dataset.</p> <pre><code>df = pd.read_csv('file.csv') df=df.set_index([&quot;date&quot;, &quot;latitude&quot;, &quot;longitude&quot;]) ds=df.to_xarray() </code></pre> https://stackoverflow.com/q/79793507 0 Kas Knicely https://stackoverflow.com/users/31617914 2025-10-18T00:25:32Z 2025-10-20T22:00:10Z <p>I'm working in a Jupyter Notebook using Python 3.12. I have a 2D xarray (in reality, it's 3D, but we can treat it as 2D). I want to pull out the values based on indices I acquire elsewhere, and then store those values in a list (or numpy array; haven't decided yet, but the particular container doesn't matter, just that I have the values stored in a container that I can easily access).</p> <p>I have code that works, but it is <em>painfully</em> slow. See below.</p> <pre><code>myValues = [] for idx_pair in myIndices: myValues.append( da[ idx_pair[0],idx_pair[1] ].item() ) </code></pre> <p>In the above, <code>myIndices</code> is a two column array, with each row being the x and y index of <code>da</code>. The amount of index pairs in <code>myIndices</code> can reach as high as 100,000, and I need to loop through several different sets of <code>myIndices</code>. <code>da</code> is a 3D xarray ( <code>da.shape ~ (20, 2000, 2000)</code> which correspond to time, x direction, y direction ), though it can be treated as a 2D array that's about 2000x2000 (just x and y). 
Getting the values for a 2D <code>da</code> takes about 35 seconds. So this code works (it's the only method I've found that does), but it is far too slow to be useful.</p> <p>**How can I more efficiently and rapidly access values from an xarray using index locations? **</p> <p>I have tried <code>da.load()</code>. Supposedly, that takes care of the lazy loading issue inherent with a lot of xarrays, but it does nothing to reduce the run time in my case.</p> <p>I've tried different ways of accessing the values from the xarray (e.g., <code>.isel(x=myXs, y=myYs)</code> or <code>da[ myXs, myYs ]</code>), but I get weird and insanely large matrices (e.g., ~100,000x100,000 with most of the values being zero). I feel like the solution to my problem is in how I'm accessing the values, but I can't figure out any other method to do it.</p> https://stackoverflow.com/q/47041512 3 Ted Habermann https://stackoverflow.com/users/2726133 2017-10-31T17:43:29Z 2025-10-01T13:09:10Z <p>The default backend engine for xarray is set to netcdf4 in this <a href="https://github.com/pydata/xarray/blob/120e039ddb80510b5fe8be9c20c10362b00c7261/xarray/backends/api.py#L23" rel="nofollow noreferrer">function</a> </p> <p>What is the best way to make h5netcdf the default engine in xarray?</p> https://stackoverflow.com/q/77607224 3 Thomas https://stackoverflow.com/users/14637 2023-12-05T15:04:21Z 2025-09-30T15:58:26Z <p>I have a bunch of NetCDF (<code>.nc</code>) files (ERA5 dataset) that I'm reading in Python through <code>xarray</code> and <code>rioxarray</code>. They end up as arrays of <code>float32</code> (4 bytes) in memory.</p> <p>However, on disk they are stored as <code>short</code> (2 bytes):</p> <pre><code>$ ncdump -h file.nc ... short u100(time, latitude, longitude) ; u100:scale_factor = 0.000895262699529722 ; u100:add_offset = 2.29252111865024 ; u100:_FillValue = -32767s ; u100:missing_value = -32767s ; ... 
</code></pre> <p>Apparently xarray automatically applies the offset and scale factor to convert these integers back into floats while reading the NetCDF file.</p> <p>Now I'm rechunking these and storing them as zarr, so I can efficiently access entire time series at a single geographical location. However, the zarr files end up at almost twice the size of the original NetCDFs, because the data remain stored as floats. Because it's about a terabyte in its original form, bandwidth and storage considerations are important, so I'd like to make this smaller. And we're not gaining anything by this additional storage size; the incoming data only had 16 bits of precision to begin with.</p> <p>I know I could just manually convert the data back to shorts on the way into zarr, and back to floats on the way out of zarr, but that's tedious and error-prone (<a href="https://stackoverflow.com/questions/75755441/why-does-saving-to-netcdf-without-encoding-change-some-values-to-nan?r=SearchResults&amp;s=25%7C11.9080">even when it happens automatically</a>).</p> <p>Is there a way to do this transparently, the way it seems to happen with NetCDF?</p> https://stackoverflow.com/q/79778444 0 Kurt https://stackoverflow.com/users/25214755 2025-09-29T18:42:27Z 2025-09-29T18:45:39Z <p>I have netCDF files of oceanographic data processed in Python, that I'd like to update the global attributes of (i.e., add the same attributes to a bunch of files). Tried doing it in Xarray per their instructions, but nothing seems to have been updated. Not sure what I'm doing wrong</p> <p>With these metadata:</p> <pre><code>metadata = {'title': 'Time-series data of currents, temperature and waves off Lahaina, Maui, 2023', 'institution':'U.S. 
Geological Survey'} </code></pre> <p>I'm trying to update files in a folder with this:</p> <pre><code>for file in filelist: with xr.open_dataset(file,mode='a') as ds: ds = ds.attrs.update(metadata) </code></pre> <p>It runs just fine, and according to the file attributes, they have been updated. But when I open the file in Panoply, nothing has been added. the Xarray documentation states that this is a safer way to update a file, as it automatically closes.</p> https://stackoverflow.com/q/79771953 0 Adriano Matos https://stackoverflow.com/users/10902944 2025-09-22T18:40:10Z 2025-09-29T17:58:13Z <pre><code>from joblib import load ntrees_16_model = load(r&quot;ntrees_quantile_16_model_watermask.joblib&quot;) ntrees_50_model = load(r&quot;ntrees_quantile_50_model_watermask.joblib&quot;) ntrees_84_model = load(r&quot;ntrees_quantile_84_model_watermask.joblib&quot;) </code></pre> <p>I am using xarray's <code>xr.map_blocks()</code> to run a function on my data frame, using these joblib files above. When I look at the Dask dashboard, I am seeing that my function is only being processed on one worker, despite having multiple workers that can do processing. 
I can supply my function, although no one would be able to replicate my issue with this code without supplying all of the input data needed.</p> <pre><code>import numpy as np import pandas as pd def generate_treelist(pixel_df, ntrees_50_model, ntrees_16_model, ntrees_84_model, pixel_resolution): try: pixel_df = pixel_df.rename(columns={'X': 'pixel_x'}) pixel_df = pixel_df.rename(columns={'Y': 'pixel_y'}) # Drop rows containing NaN values pixel_df = pixel_df.dropna() if pixel_df.isnull().values.any(): print(f&quot;[ERROR] NaNs detected in pixel_df after dropping NaN rows.&quot;) return None # Define the sampleDraws function pixel_df = pixel_df.reset_index(drop=True) def sampleDraws(df, num_pulls): try: pull_cols = ['pull_' + str(i+1) for i in range(num_pulls)] random_samples = np.random.normal( df['mu'].values[:, np.newaxis], df['sd'].values[:, np.newaxis], (len(df), num_pulls)) samples_df = pd.DataFrame( random_samples, columns=pull_cols, index=df.index) out_df = pd.concat([df, samples_df], axis=1) return out_df except Exception as e: print(f&quot;[ERROR] Exception occurred in sampleDraws: {e}&quot;) return None # Main treelist generation code starts here pixel_coords = pixel_df[[&quot;pixel_x&quot;, &quot;pixel_y&quot;]] pixel_df = pixel_df.drop( columns=['pixel_x', 'pixel_y', 'time']) desired_columns = [ 'tmmx_1_mean', 'tmmn_1_mean', 'pr_1_mean', 'vs_1_mean', 'rmin_1_mean', 'rmax_1_mean', 'etr_1_mean', 'tmmx_5_mean', 'tmmn_5_mean', 'pr_5_mean', 'vs_5_mean', 'rmin_5_mean', 'rmax_5_mean', 'etr_5_mean', 'tmmx_10_mean', 'tmmn_10_mean', 'pr_10_mean', 'vs_10_mean', 'rmin_10_mean', 'rmax_10_mean', 'etr_10_mean', 'tmmx_20_mean', 'tmmn_20_mean', 'pr_20_mean', 'vs_20_mean', 'rmin_20_mean', 'rmax_20_mean', 'etr_20_mean', 'tmmx_30_mean', 'tmmn_30_mean', 'pr_30_mean', 'vs_30_mean', 'rmin_30_mean', 'rmax_30_mean', 'etr_30_mean', 'water_mask', 'ls_blue', 'ls_green', 'ls_red', 'ls_nir', 'ls_swir1', 'ls_swir2', 'mean_num_of_fires', 'mode_years_since_fire' ] pixel_df = 
pixel_df[desired_columns] ntrees_50_pred = ntrees_50_model.predict(pixel_df) ntrees_16_pred = ntrees_16_model.predict(pixel_df) ntrees_84_pred = ntrees_84_model.predict(pixel_df) if np.isnan(ntrees_50_pred).any() or np.isnan(ntrees_16_pred).any() or np.isnan(ntrees_84_pred).any(): print(f&quot;[ERROR] NaNs detected in number of trees prediction.&quot;) return None # Store number of trees' mean and standard deviation for uncertainty ntrees_mu = ntrees_50_pred ntrees_sd = abs(ntrees_84_pred - ntrees_16_pred) / 2 df = pd.DataFrame({'mu': ntrees_mu, 'sd': ntrees_sd}) print(f&quot;[INFO] Sampling number of trees...&quot;) random_ntrees = sampleDraws(df, num_pulls=1) if random_ntrees is None: return None random_ntrees_clean = random_ntrees.drop(['mu', 'sd'], axis=1) unscaled_round_ntrees = np.round(random_ntrees_clean).astype(int) random_ntrees_clean[random_ntrees_clean &lt; 0] = 0 current_ntrees = unscaled_round_ntrees.iloc[:, 0] current_ntrees[current_ntrees &lt; 0] = 0 pixel_df = pd.concat([pixel_df, current_ntrees], axis=1) random_coords = np.random.uniform(-pixel_resolution / 2, pixel_resolution / 2, (len(random_ntrees_clean), 2)) repeated_pixel_coords = pixel_coords.values.repeat( current_ntrees, axis=0) tree_coords = repeated_pixel_coords + random_coords # Repeat pixel-level variables to match the length of treelist ntrees_mu_repeated = np.repeat(ntrees_mu, current_ntrees) ntrees_sd_repeated = np.repeat(ntrees_sd, current_ntrees) # Construct final treelist DataFrame with uncertainty columns treelist = pd.DataFrame({ 'pixel_x': repeated_pixel_coords[:, 0], 'pixel_y': repeated_pixel_coords[:, 1], 'tree_x': tree_coords[:, 0], 'tree_y': tree_coords[:, 1], 'n_trees_mu': ntrees_mu_repeated, 'n_trees_sd': ntrees_sd_repeated }) print(treelist) return treelist except Exception as e: print(f&quot;[ERROR] Exception occurred during treelist generation: {e}&quot;) return None </code></pre> <p>Only thing I can think of is there is a &quot;proper&quot; way to load the joblib 
files to the Dask cluster as it is currently being loaded just in my client. Am I on the right track with my thinking?</p> <p>IMPORTANT EDIT: I realize now that I didn't provide context on the data frame part. I have a wrapper function that converts the xarray into a data frame, then it runs <code>generate_treelist()</code> within this wrapper, and then converts the resulting data frame back into an xarray</p> https://stackoverflow.com/q/79772975 0 Yotam Ben Saadon https://stackoverflow.com/users/23213308 2025-09-23T18:54:04Z 2025-09-25T22:19:58Z <p>I am trying to concatenate 10 netCDF files that are output files from a software named Ichthyop. Each of the files is a result of a lagrangian simulation of particles drifting in the Eastern Mediterranean. The files are of consecutive simulations for the years 2013-2022. They all have the same lon and lat coordinates.</p> <p>Here is an example of their structure using ncdump -h:</p> <pre class="lang-none prettyprint-override"><code>netcdf mercator3d_ichthyop-run202508261903 { dimensions: time = UNLIMITED ; // (831 currently) drifter = 9900 ; type_zone = 3 ; edge = 1224 ; latlon = 2 ; zone0 = 10 ; geozone0 = 5 ; zone1 = 30 ; geozone1 = 5 ; zone2 = 12 ; geozone2 = 6 ; variables: double time(time) ; time:calendar = &quot;gregorian&quot; ; time:units = &quot;seconds since 1900-01-01 00:00&quot; ; float lon(time, drifter) ; lon:long_name = &quot;particle longitude&quot; ; lon:unit = &quot;degree east&quot; ; float lat(time, drifter) ; lat:long_name = &quot;particle latitude&quot; ; lat:unit = &quot;degree north&quot; ; int mortality(time, drifter) ; mortality:long_name = &quot;particle cause of mortality&quot; ; mortality:unit = &quot;death cause&quot; ; mortality:alive = 0 ; mortality:dead_cold = 1 ; mortality:out_of_domain = 2 ; mortality:old = 3 ; mortality:beached = 4 ; mortality:starvation = 5 ; mortality:dead_hot = 6 ; mortality:dead_fresh = 7 ; mortality:dead_saline = 8 ; float depth(time, drifter) ; depth:long_name = 
&quot;particle depth&quot; ; depth:unit = &quot;meter&quot; ; int zone(time, drifter, type_zone) ; zone:long_name = &quot;In which zones is the particle ?&quot; ; zone:unit = &quot;the index of the zone&quot; ; zone:type_zone_0 = &quot;release&quot; ; zone:release_zone_0 = &quot;zone1&quot; ; zone:release_zone_1 = &quot;zone2&quot; ; zone:release_zone_2 = &quot;zone3&quot; ; zone:type_zone_1 = &quot;recruitment&quot; ; zone:recruitment_zone = &quot;none for this run&quot; ; zone:type_zone_2 = &quot;target&quot; ; zone:target_zone = &quot;none for this run&quot; ; zone:not_released_yet = -99 ; zone:out_of_zone = -1 ; int release_zone(drifter) ; release_zone:long_name = &quot;In which zone has been released the particle ?&quot; ; release_zone:unit = &quot;the index of the release zone&quot; ; release_zone:release_zone_0 = &quot;zone1&quot; ; release_zone:release_zone_1 = &quot;zone2&quot; ; release_zone:release_zone_2 = &quot;zone3&quot; ; release_zone:not_released_yet = -99 ; float region_edge(edge, latlon) ; region_edge:long_name = &quot;geoposition of region edge&quot; ; region_edge:unit = &quot;lat degree north lon degree east&quot; ; int drifter(drifter) ; drifter:long_name = &quot;drifter index&quot; ; drifter:unit = &quot;&quot; ; float coord_zone0(zone0, latlon) ; coord_zone0:long_name = &quot;zone1&quot; ; coord_zone0:unit = &quot;x and y coordinates of the center of the cells in the zone&quot; ; coord_zone0:type = &quot;release&quot; ; coord_zone0:color = &quot;[r=255,g=0,b=255]&quot; ; float coord_geo_zone0(geozone0, latlon) ; float coord_zone1(zone1, latlon) ; coord_zone1:long_name = &quot;zone2&quot; ; coord_zone1:unit = &quot;x and y coordinates of the center of the cells in the zone&quot; ; coord_zone1:type = &quot;release&quot; ; coord_zone1:color = &quot;[r=255,g=0,b=255]&quot; ; float coord_geo_zone1(geozone1, latlon) ; float coord_zone2(zone2, latlon) ; coord_zone2:long_name = &quot;zone3&quot; ; coord_zone2:unit = &quot;x and y coordinates of the 
center of the cells in the zone&quot; ; coord_zone2:type = &quot;release&quot; ; coord_zone2:color = &quot;[r=255,g=0,b=255]&quot; ; float coord_geo_zone2(geozone2, latlon) ; // global attributes: :transport_dimension = &quot;3d&quot; ; :release.schedule.is_enabled = &quot;true&quot; ; :release.schedule.events = &quot;\&quot;year 2013 month 06 day 01 at 12:00\&quot; \&quot;year 2013 month 06 day 03 at 12:00\&quot; \&quot;year 2013 month 06 day 06 at 12:00\&quot; \&quot;year 2013 month 06 day 09 at 12:00\&quot; \&quot;year 2013 month 06 day 12 at 12:00\&quot; \&quot;year 2013 month 06 day 15 at 12:00\&quot; \&quot;year 2013 month 06 day 18 at 12:00\&quot; \&quot;year 2013 month 06 day 21 at 12:00\&quot; \&quot;year 2013 month 06 day 24 at 12:00\&quot; \&quot;year 2013 month 06 day 27 at 12:00\&quot; \&quot;year 2013 month 06 day 30 at 12:00\&quot; \&quot;year 2013 month 07 day 03 at 12:00\&quot; \&quot;year 2013 month 07 day 06 at 12:00\&quot; \&quot;year 2013 month 07 day 09 at 12:00\&quot; \&quot;year 2013 month 07 day 12 at 12:00\&quot; \&quot;year 2013 month 07 day 15 at 12:00\&quot; \&quot;year 2013 month 07 day 18 at 12:00\&quot; \&quot;year 2013 month 07 day 21 at 12:00\&quot; \&quot;year 2013 month 07 day 24 at 12:00\&quot; \&quot;year 2013 month 07 day 27 at 12:00\&quot; \&quot;year 2013 month 07 day 30 at 12:00\&quot; \&quot;year 2013 month 08 day 02 at 12:00\&quot; \&quot;year 2013 month 08 day 15 at 12:00\&quot; \&quot;year 2013 month 08 day 30 at 12:00\&quot; \&quot;year 2013 month 09 day 01 at 12:00\&quot; \&quot;year 2013 month 09 day 15 at 12:00\&quot; \&quot;year 2013 month 09 day 30 at 12:00\&quot; \&quot;year 2013 month 10 day 01 at 12:00\&quot; \&quot;year 2013 month 10 day 15 at 12:00\&quot; \&quot;year 2013 month 10 day 31 at 12:00\&quot; \&quot;year 2013 month 11 day 01 at 12:00\&quot; \&quot;year 2013 month 11 day 15 at 12:00\&quot; \&quot;year 2013 month 11 day 30 at 12:00\&quot;&quot; ; :app.time.initial_time = &quot;year 2013 month 01 day 
01 at 12:00&quot; ; :app.time.transport_duration = &quot;0030 day(s) 00 hour(s) 00 minute(s)&quot; ; :app.time.time_step = &quot;3150&quot; ; :app.output.netcdf_output_format = &quot;netcdf3&quot; ; :app.output.file_prefix = &quot;mercator3d&quot; ; :app.output.output_path = &quot;/home/yotam/Documents/oceanographic_processes_biological_implications/lionfish_dispersal/output/&quot; ; :app.output.record_frequency = &quot;12&quot; ; :app.particle_length.initial_length = &quot;0.89&quot; ; :app.particle_length.hatch_length = &quot;2.09&quot; ; :app.particle_length.yolk2feeding_length = &quot;2.6&quot; ; :app.transport.coastline_behavior = &quot;Bouncing&quot; ; :action.recruitment.stain.enabled = &quot;false&quot; ; :action.lethal_temp.enabled = &quot;false&quot; ; :action.recruitment.zone.enabled = &quot;false&quot; ; :action.advection.enabled = &quot;true&quot; ; :action.advection.scheme = &quot;Runge Kutta 4&quot; ; :action.swimming.enabled = &quot;false&quot; ; :action.growth.enabled = &quot;false&quot; ; :action.hdisp.enabled = &quot;true&quot; ; :action.hdisp.epsilon = &quot;1E-9&quot; ; :action.wind.enabled = &quot;false&quot; ; :release.txtfile.enabled = &quot;false&quot; ; :release.ncfile.enabled = &quot;false&quot; ; :release.zone.enabled = &quot;true&quot; ; :release.zone.number_particles = &quot;300&quot; ; :release.zone.zone_file = &quot;/home/yotam/Documents/oceanographic_processes_biological_implications/ichthyop_data/cfg/release_east_med_lev.xml&quot; ; :release.patches.enabled = &quot;false&quot; ; :release.stain.enabled = &quot;false&quot; ; :dataset.mercator3d.enabled = &quot;true&quot; ; :dataset.mercator3d.input_path = &quot;/home/yotam/Documents/oceanographic_processes_biological_implications/lionfish_dispersal/bashscripts/bashscripts_all/copernicusdata/&quot; ; :dataset.mercator3d.hgr_pattern = &quot;MED-MFC_006_004_coordinates_cropped.nc&quot; ; :dataset.mercator3d.zgr_pattern = &quot;MED-MFC_006_004_coordinates_cropped.nc&quot; ; 
:dataset.mercator3d.byte_mask_pattern = &quot;MED-MFC_006_004_mask_bathy_cropped.nc&quot; ; :dataset.mercator3d.gridu_pattern = &quot;*stks_data_*.nc&quot; ; :dataset.mercator3d.gridv_pattern = &quot;*stks_data_*.nc&quot; ; :dataset.mercator3d.gridt_pattern = &quot;*stks_data_*.nc&quot; ; :dataset.mercator3d.field_dim_x = &quot;longitude&quot; ; :dataset.mercator3d.field_dim_y = &quot;latitude&quot; ; :dataset.mercator3d.field_dim_z = &quot;depth&quot; ; :dataset.mercator3d.field_dim_time = &quot;time&quot; ; :dataset.mercator3d.field_var_lon = &quot;longitude&quot; ; :dataset.mercator3d.field_var_lat = &quot;latitude&quot; ; :dataset.mercator3d.field_var_mask = &quot;mask&quot; ; :dataset.mercator3d.field_var_u = &quot;uo&quot; ; :dataset.mercator3d.field_var_v = &quot;vo&quot; ; :dataset.mercator3d.field_var_time = &quot;time&quot; ; :dataset.mercator3d.field_var_e3t = &quot;e3t&quot; ; :dataset.mercator3d.field_var_e2t = &quot;e2t&quot; ; :dataset.mercator3d.field_var_e1t = &quot;e1t&quot; ; :xml_file = &quot;/home/yotam/Documents/oceanographic_processes_biological_implications/ichthyop_data/cfg/cfg-mercator3d-phd_1.xml&quot; ; :nb_zones = 3 ; } </code></pre> <p>All 10 files are identical, with the exception of the attributes <code>app.time.initial.time</code> which changes according to the year of the simulation and the <code>release.schedule.events</code>, which also change according to the year, but otherwise stay the same, e.g for 2013 <code>&quot;\&quot;year 2013 month 06 day 01 at 12:00\&quot; \&quot;year 2013 month 06 day 03 at 12:00\&quot; \&quot;year 2013 month 06 day 06 at 12:00\&quot;</code> and etc., for 2014 <code>&quot;\&quot;year 2014 month 06 day 01 at 12:00\&quot; \&quot;year 2014 month 06 day 03 at 12:00\&quot; \&quot;year 2014 month 06 day 06 at 12:00\&quot;</code> and so on.</p> <p>What I need, but haven't found a way to do yet, since I have limited xarray and python netcdf knowledge is how when concatenating them, to not only keep and 
concatenate the <code>release.schedule.events</code> attribute, but also to find a way to add to the drifters the attribute of year of release, in order to differentiate the drifters between the different files, since they are all named 0-9899 in all of the files (drifter index).</p> <p>Here is an example of how they appear:</p> <pre class="lang-none prettyprint-override"><code>Out[22]: &lt;xarray.DataArray 'drifter' (drifter: 9900)&gt; Size: 40kB array([ 0, 1, 2, ..., 9897, 9898, 9899], shape=(9900,), dtype=int32) Coordinates: * drifter (drifter) int32 40kB 0 1 2 3 4 5 ... 9894 9895 9896 9897 9898 9899 Attributes: long_name: drifter index unit: </code></pre> <p>Besides using python, I have also tried both the CDO and NCO (ncrcat and mergetime) but those also didn't work. Any help on how to do this in any possible method would be much appreciated, although ideally I would prefer to solve this using xarray in order to improve my xarray knowledge.</p> <p>Thanks in advance to any help provided</p> <p><strong>EDIT</strong></p> <p>Here is a link to the 10 files I am trying to concatenate, while also keeping the attributes of each file and re-indexing the drifter dimension: <a href="https://wetransfer.com/downloads/52f893c68d89214d756e2bb0ce7e9ae020250925141818/0cca35?t_exp=1759069098&amp;t_lsid=3e703ea7-c1fc-469d-921c-2bf0c2b12e87&amp;t_network=link&amp;t_rid=ZW1haWx8YWRyb2l0fGRmMjlkNWM4LTdhODEtNDJiNC05MmVhLWYxZGEyYWU4NmQ0NA==&amp;t_s=download_link&amp;t_ts=1758809898" rel="nofollow noreferrer">link_to_files</a></p> https://stackoverflow.com/q/79765956 0 ibrahimin https://stackoverflow.com/users/29140944 2025-09-16T08:21:49Z 2025-09-17T15:58:18Z <p>I calculated the Growing Season Length (GSL) index for the 1950–2023 period in Turkey using ERA5-Land daily mean temperature data. 
According to the definition:</p> <ul> <li>First, find the first occurrence of at least 6 consecutive days with daily mean temperature &gt; 5 °C.</li> <li>Then, after July 1st (NH), find the first occurrence of at least 6 consecutive days with daily mean temperature &lt; 5 °C.</li> <li>The number of days between these two dates is recorded as GSL.</li> </ul> <p>The issue:</p> <ul> <li>In coastal areas or warmer regions, the second condition (6 consecutive days &lt; 5 °C) may never occur in some years, so GSL cannot be calculated and remains NaN.</li> <li>For example, between 2000–2023, in some pixels GSL is missing for years like 2004, 2007, 2009, and 2012.</li> <li>When I compute trends (e.g., Mann-Kendall), these missing years result in artificially high positive or negative trends in some pixels, which are not realistic.</li> </ul> <p>My questions:</p> <ul> <li>How should I handle such missing years before performing trend analysis?</li> </ul> <p>What I tried:</p> <ul> <li>Implemented GSL following the ETCCDI definition with ERA5-Land daily mean temperatures (1950–2023).</li> <li>Calculated GSL pixel-wise using xarray + numpy.</li> <li>Applied Mann-Kendall trend test on the yearly GSL time series. What I expected:</li> <li>A realistic spatial distribution of GSL trends, e.g., gradual lengthening or shortening in certain regions.</li> <li>Missing years to have minimal influence on long-term trends. What happened instead:</li> <li>Pixels with missing years (due to no 6-day &lt;5 °C period) show very large artificial trends (sometimes &gt;±10 days/decade), which are not climatologically reasonable.</li> </ul> https://stackoverflow.com/q/79753007 0 Nanoputian https://stackoverflow.com/users/10355850 2025-09-02T01:45:34Z 2025-09-04T19:17:30Z <p>I have a dasked xarray which is about 150k x 90k with chunk size of 8192 x 8192. I am working on a Window virtual machine which has 100gb RAM and 16 cores.</p> <p>I want to plot it using the <code>Datashader</code> package. 
though my (limited) understanding is that since the xarray is chunked,
I have almost certainly misunderstood some things.
Dask also cannot run this lazily and computes instantly, which is also confusing to me (and the reason for my initial question in the issue). It just keeps allocating RAM until my RAM and SWAP are full and my laptop collapses.
# this computes on the spot and allocates far too much RAM in a single worker!
I figured I would use xarray to help get the job done, since I have a lot of processors available to take advantage of Dask, and xarray has both <code>xr.open_mfdataset</code> and <code>ds.to_zarr</code>.</p> <p>I first tried rechunking to <code>chunks={'time':24, 'y':768, 'x':922}</code> to match the input NetCDF4 chunking in <code>x</code> and <code>y</code>, but when I tried to write to Zarr it complained because it needed uniform chunk sizes in both <code>x</code> and <code>y</code>, only allowing non-uniform size in the last chunk along the <code>time</code> dimension (and unfortunately in the <code>x</code> dimension, the total size 4608 is not a multiple of chunk size 922).</p> <p>So then I tried <code>chunks={'time':168, 'y':384, 'x':288}</code> and that started working, and proceeded very quickly for a few minutes, then got slower and slower. Eventually after 50 minutes, the cluster died with:</p> <pre><code>4072 distributed.core - INFO - Event loop was unresponsive in Worker for 1.41s. This is often caused by long-running GIL-holding functions or moving large chunks of data. This can cause timeouts and instability. 
4073 slurmstepd: error: Step 3294889.0 exceeded memory limit (25346188 &gt; 25165824), being killed </code></pre> <p>Here's the code I'm using:</p> <pre><code>from dask.distributed import Client import pandas as pd import xarray as xr import s3fs import zarr client = Client(scheduler_file='/home/rsignell/scheduler.json') client </code></pre> <p><a href="https://i.sstatic.net/F4zyZ.png" rel="noreferrer"><img src="https://i.sstatic.net/F4zyZ.png" alt="enter image description here"></a></p> <pre><code>root = '/lustre/projects/hazards/cmgp/woodshole/rsignell/nwm/forcing_short_range/' bucket_endpoint='https://s3.us-west-1.amazonaws.com/' f_zarr = 'rsignell/nwm/test_week4' dates = pd.date_range(start='2018-04-01T00:00', end='2018-04-07T23:00', freq='H') urls = ['{}{}/nwm.t{}z.short_range.forcing.f001.conus.nc'.format(root,a.strftime('%Y%m%d'),a.strftime('%H')) for a in dates] ds = xr.open_mfdataset(urls, concat_dim='time', chunks={'time':1, 'y':768, 'x':922}) ds = ds.drop(['ProjectionCoordinateSystem','time_bounds']) ds = ds.chunk(chunks={'time':168, 'y':384, 'x':288}).persist() ds </code></pre> <p>producing</p> <pre><code>&lt;xarray.Dataset&gt; Dimensions: (reference_time: 168, time: 168, x: 4608, y: 3840) Coordinates: * reference_time (reference_time) datetime64[ns] 2018-04-01 ... * x (x) float64 -2.304e+06 -2.303e+06 -2.302e+06 -2.301e+06 ... * y (y) float64 -1.92e+06 -1.919e+06 -1.918e+06 -1.917e+06 ... * time (time) datetime64[ns] 2018-04-01T01:00:00 ... 
Data variables: T2D (time, y, x) float64 dask.array&lt;shape=(168, 3840, 4608), chunksize=(168, 384, 288)&gt; LWDOWN (time, y, x) float64 dask.array&lt;shape=(168, 3840, 4608), chunksize=(168, 384, 288)&gt; Q2D (time, y, x) float64 dask.array&lt;shape=(168, 3840, 4608), chunksize=(168, 384, 288)&gt; U2D (time, y, x) float64 dask.array&lt;shape=(168, 3840, 4608), chunksize=(168, 384, 288)&gt; V2D (time, y, x) float64 dask.array&lt;shape=(168, 3840, 4608), chunksize=(168, 384, 288)&gt; PSFC (time, y, x) float64 dask.array&lt;shape=(168, 3840, 4608), chunksize=(168, 384, 288)&gt; RAINRATE (time, y, x) float32 dask.array&lt;shape=(168, 3840, 4608), chunksize=(168, 384, 288)&gt; SWDOWN (time, y, x) float64 dask.array&lt;shape=(168, 3840, 4608), chunksize=(168, 384, 288)&gt; </code></pre> <p>Then I call</p> <pre><code>fs = s3fs.S3FileSystem(anon=False, client_kwargs=dict(endpoint_url=bucket_endpoint)) d = s3fs.S3Map(f_zarr, s3=fs) compressor = zarr.Blosc(cname='zstd', clevel=3, shuffle=2) encoding = {vname: {'compressor': compressor} for vname in ds.data_vars} delayed_store = ds.to_zarr(store=d, mode='w', encoding=encoding, compute=False) persist_store = delayed_store.persist(retries=100) </code></pre> <p>and just before it dies, the Dask Daskboard looks like this:</p> <p><a href="https://i.sstatic.net/Q6EDn.png" rel="noreferrer"><img src="https://i.sstatic.net/Q6EDn.png" alt="enter image description here"></a> <a href="https://i.sstatic.net/D1DQS.png" rel="noreferrer"><img src="https://i.sstatic.net/D1DQS.png" alt="enter image description here"></a></p> <p>The total size of the NetCDF4 files is 20GB, so it seems a bit crazy that I've got over 500GB showing in the Dask Dashboard, and that 30 processors each with 60GB RAM is not sufficient for the job. 
</p> <p>What am I doing wrong, or what would be a better solution?</p> https://stackoverflow.com/q/54777638 3 alextc https://stackoverflow.com/users/1021306 2019-02-20T01:50:05Z 2025-08-22T12:16:50Z <p>I need to set a title for multiple plots each of which is drawn with Basemap.</p> <p>The souce data is an xarray's DataArray object. The code uses DataArray.plot() to draw multiple plots on a single figure.</p> <pre><code>p = da_monthly_slice.plot(levels=[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31], cmap=plt.cm.Reds, x='longitude', y='latitude', col='time', col_wrap=3) for i, ax in enumerate(p.axes.flatten()): ax.set_title('Month: %d' % i) ax.set_xlabel('Longitude') ax.set_ylabel('Latitude') map = Basemap(llcrnrlat=-39.2,urcrnrlat=-33.9,llcrnrlon=140.8,urcrnrlon=150.0,resolution='i',ax=ax) map.readshapefile(shapefile1, 'CFA_DISTRICT_BODY', linewidth=0.5) fig = plt.figure() st = fig.suptitle("Number of days for each month meeting the criteria in 2017", fontsize="large") plt.show() </code></pre> <p>This caused two figures drawn separately. 
One is for the multiple plots while the other one contains the title only.</p> <p>How to make the title to show on the top of the multiple plots on the same figure?</p> https://stackoverflow.com/q/79741812 0 rock0789 https://stackoverflow.com/users/19349186 2025-08-21T04:13:09Z 2025-08-21T04:48:19Z <p>I use Python version 3.9.18 to reading wrfout files (name like: <code>wrfout_d02_2020-01-01_00:00:00</code>) and get T2, Q2, PSFC, U10, V10, ACSWDNB variables and combine all days in the month to a output netcdf file (name like : WRF_PGW_2C_re-202001-d02-hourly.nc)。</p> <p>Here is the code:</p> <pre><code>import xarray as xr import glob from metpy.units import units import numpy as np import pandas as pd import calendar from tqdm import tqdm def get_file_list(input_dir: str, year: int, month: int) -&gt; list: &quot;&quot;&quot; Get list of WRF output files for a given year and month. Args: input_dir (str): Directory containing WRF output files. year (int): Year of the data. month (int): Month of the data. Returns: list: Sorted list of file paths. &quot;&quot;&quot; file_pattern = f'{input_dir}wrfout_d02_{year}-{month:02}-*_00:00:00' return sorted(glob.glob(file_pattern)) def process_single_file(file: str) -&gt; xr.Dataset: &quot;&quot;&quot; Process a single WRF output file and extract specified variables. Args: file (str): Path to the WRF output file. Returns: xr.Dataset: Dataset with extracted variables and units. 
&quot;&quot;&quot; ds = xr.open_dataset(file, engine='netcdf4') # Extract variables with units data_vars = { 'Times': ds['Times'], 'T2': ds['T2'] * units.kelvin, 'Q2': ds['Q2'] * units('kg/kg'), 'PSFC': ds['PSFC'] * units.pascal, 'U10': ds['U10'] * units('m/s'), 'V10': ds['V10'] * units('m/s'), 'ACSWDNB': ds['ACSWDNB'] * units('J/m^2'), } # Create new dataset single_ds = xr.Dataset(data_vars) # Copy attributes for var in data_vars: single_ds[var].attrs = ds[var].attrs ds.close() return single_ds def merge_datasets(datasets: list, year: int, month: int) -&gt; xr.Dataset: &quot;&quot;&quot; Merge datasets along the 'ymd' dimension. Args: datasets (list): List of xarray Datasets. year (int): Year of the data. month (int): Month of the data. Returns: xr.Dataset: Merged dataset. &quot;&quot;&quot; # merged_ds = xr.concat(datasets, dim='ymd', data_vars='minimal', coords='minimal') merged_ds = xr.concat(datasets, dim='ymd') return merged_ds.transpose('ymd', 'Time', 'south_north', 'west_east') def save_to_netcdf(merged_ds: xr.Dataset, output_path: str, year: int, month: int) -&gt; None: &quot;&quot;&quot; Save merged dataset to a NetCDF file. Args: merged_ds (xr.Dataset): Merged dataset to save. output_path (str): Directory to save the NetCDF file. year (int): Year of the data. month (int): Month of the data. &quot;&quot;&quot; output_file = f'WRF_PGW_2C_re-{year}{month:02}-d02-hourly.nc' merged_ds.to_netcdf( f'{output_path}{output_file}', format='NETCDF4', engine='netcdf4', encoding={ 'Times': {'dtype': 'S1'}, 'T2': {'dtype': 'float32'}, 'Q2': {'dtype': 'float32'}, 'PSFC': {'dtype': 'float32'}, 'U10': {'dtype': 'float32'}, 'V10': {'dtype': 'float32'}, 'ACSWDNB': {'dtype': 'float32'}, } ) def main(): &quot;&quot;&quot; Main function to process WRF output files, merge daily data into monthly files, and save as NetCDF. 
&quot;&quot;&quot; base_input_path = '/home/user/work/share/WRF3type/WRF_PGW/2C_re/' #for PGW_2C run output_path = '/home/user/work/share/WRF3type/hrdata-TCCIP-WRF3type-nc/PGW_2C/' years = range(2020, 2021) for year in tqdm(years, desc=&quot;Processing years&quot;): year_folder = f'2C_re-{year}-PGW_wrfout_ERA5_domainv2_L38/' input_dir = f'{base_input_path}{year_folder}test3/' for month in tqdm(range(1, 2), desc=f&quot;Processing months for {year}&quot;, leave=False): # Get file list file_list = get_file_list(input_dir, year, month) # Process each file with progress bar datasets = [process_single_file(file) for file in tqdm(file_list, desc=f&quot;Processing files for {year}-{month:02}&quot;, leave=False)] # Merge datasets merged_ds = merge_datasets(datasets, year, month) # Save to NetCDF save_to_netcdf(merged_ds, output_path, year, month) if __name__ == &quot;__main__&quot;: main() </code></pre> <p>Then <code>ncdump -c</code> the netcdf file:</p> <pre><code>netcdf WRF_PGW_2C_re-202001-d02-hourly { dimensions: ymd = 31 ; Time = 24 ; string19 = 19 ; south_north = 230 ; west_east = 150 ; variables: char Times(ymd, Time, string19) ; Times:coordinates = &quot;XTIME&quot; ; float T2(ymd, Time, south_north, west_east) ; T2:_FillValue = NaNf ; T2:FieldType = 104 ; T2:MemoryOrder = &quot;XY &quot; ; T2:description = &quot;TEMP at 2 M&quot; ; T2:units = &quot;K&quot; ; T2:stagger = &quot;&quot; ; T2:coordinates = &quot;XLAT XLONG XTIME&quot; ; float Q2(ymd, Time, south_north, west_east) ; Q2:_FillValue = NaNf ; Q2:FieldType = 104 ; Q2:MemoryOrder = &quot;XY &quot; ; Q2:description = &quot;QV at 2 M&quot; ; Q2:units = &quot;kg kg-1&quot; ; Q2:stagger = &quot;&quot; ; Q2:coordinates = &quot;XLAT XLONG XTIME&quot; ; float PSFC(ymd, Time, south_north, west_east) ; PSFC:_FillValue = NaNf ; PSFC:FieldType = 104 ; PSFC:MemoryOrder = &quot;XY &quot; ; PSFC:description = &quot;SFC PRESSURE&quot; ; PSFC:units = &quot;Pa&quot; ; PSFC:stagger = &quot;&quot; ; PSFC:coordinates = 
&quot;XLAT XLONG XTIME&quot; ; float U10(ymd, Time, south_north, west_east) ; U10:_FillValue = NaNf ; U10:FieldType = 104 ; U10:MemoryOrder = &quot;XY &quot; ; U10:description = &quot;U at 10 M&quot; ; U10:units = &quot;m s-1&quot; ; U10:stagger = &quot;&quot; ; U10:coordinates = &quot;XLAT XLONG XTIME&quot; ; float V10(ymd, Time, south_north, west_east) ; V10:_FillValue = NaNf ; V10:FieldType = 104 ; V10:MemoryOrder = &quot;XY &quot; ; V10:description = &quot;V at 10 M&quot; ; V10:units = &quot;m s-1&quot; ; V10:stagger = &quot;&quot; ; V10:coordinates = &quot;XLAT XLONG XTIME&quot; ; float ACSWDNB(ymd, Time, south_north, west_east) ; ACSWDNB:_FillValue = NaNf ; ACSWDNB:FieldType = 104 ; ACSWDNB:MemoryOrder = &quot;XY &quot; ; ACSWDNB:description = &quot;ACCUMULATED DOWNWELLING SHORTWAVE FLUX AT BOTTOM&quot; ; ACSWDNB:units = &quot;J m-2&quot; ; ACSWDNB:stagger = &quot;&quot; ; ACSWDNB:coordinates = &quot;XLAT XLONG XTIME&quot; ; float XTIME(ymd, Time) ; XTIME:_FillValue = NaNf ; XTIME:FieldType = 104 ; XTIME:MemoryOrder = &quot;0 &quot; ; XTIME:description = &quot;minutes since 2020-01-01 00:00:00&quot; ; XTIME:stagger = &quot;&quot; ; XTIME:units = &quot;minutes since 2020-01-01&quot; ; XTIME:calendar = &quot;proleptic_gregorian&quot; ; float XLAT(Time, south_north, west_east) ; XLAT:_FillValue = NaNf ; XLAT:FieldType = 104 ; XLAT:MemoryOrder = &quot;XY &quot; ; XLAT:description = &quot;LATITUDE, SOUTH IS NEGATIVE&quot; ; XLAT:units = &quot;degree_north&quot; ; XLAT:stagger = &quot;&quot; ; XLAT:coordinates = &quot;XLONG XLAT&quot; ; float XLONG(Time, south_north, west_east) ; XLONG:_FillValue = NaNf ; XLONG:FieldType = 104 ; XLONG:MemoryOrder = &quot;XY &quot; ; XLONG:description = &quot;LONGITUDE, WEST IS NEGATIVE&quot; ; XLONG:units = &quot;degree_east&quot; ; XLONG:stagger = &quot;&quot; ; XLONG:coordinates = &quot;XLONG XLAT&quot; ; data: } </code></pre> <p>I have two questions in mind:</p> <ol> <li><p>In the Python code, I didn't edit any setting or 
extracting in XTIME, but I have no idea why it appear in the netcdf file. In other words, I just extract 'Times' variable to reveal the datetime in string. Is there any method to drop XTIME when merging datasets If possible?</p> </li> <li><p>XLAT, XLONG for the latitude and longitude information, I hope they in 2D (south_north, west_east), without &quot;Time&quot; or &quot;ymd&quot; dimension, and change the order to the first two variables.</p> </li> </ol> https://stackoverflow.com/q/67963199 2 yacx https://stackoverflow.com/users/9356675 2021-06-13T22:53:01Z 2025-08-16T08:03:10Z <p>I have a grib file containing monthly precipitation and temperature from 1989 to 2018 (extracted from ERA5-Land).</p> <p>I need to have those data in a dataset format with 6 column : longitude, latitude, ID of the cell/point in the grib file, date, temperature and precipitation.</p> <p>I first imported the file using cfgrib. Here is what contains the xdata list after importation:</p> <pre><code>import cfgrib grib_data = cfgrib.open_datasets('\era5land_extract.grib') grib_data Out[6]: [&lt;xarray.Dataset&gt; Dimensions: (latitude: 781, longitude: 761, time: 372) Coordinates: number int32 0 * time (time) datetime64[ns] 1989-01-01 1989-02-01 ... 2019-12-01 step timedelta64[ns] 1 days surface float64 0.0 * latitude (latitude) float64 42.0 41.9 41.8 41.7 ... -35.8 -35.9 -36.0 * longitude (longitude) float64 -21.0 -20.9 -20.8 -20.7 ... 54.8 54.9 55.0 valid_time (time) datetime64[ns] ... Data variables: t2m (time, latitude, longitude) float32 ... Attributes: GRIB_edition: 1 GRIB_centre: ecmf GRIB_centreDescription: European Centre for Medium-Range Weather Forecasts GRIB_subCentre: 0 Conventions: CF-1.7 institution: European Centre for Medium-Range Weather Forecasts, &lt;xarray.Dataset&gt; Dimensions: (latitude: 781, longitude: 761, time: 156) Coordinates: number int32 0 * time (time) datetime64[ns] 1989-01-01 1989-02-01 ... 
2001-12-01 step timedelta64[ns] 1 days surface float64 0.0 * latitude (latitude) float64 42.0 41.9 41.8 41.7 ... -35.8 -35.9 -36.0 * longitude (longitude) float64 -21.0 -20.9 -20.8 -20.7 ... 54.8 54.9 55.0 valid_time (time) datetime64[ns] ... Data variables: tp (time, latitude, longitude) float32 ... Attributes: GRIB_edition: 1 GRIB_centre: ecmf GRIB_centreDescription: European Centre for Medium-Range Weather Forecasts GRIB_subCentre: 0 Conventions: CF-1.7 institution: European Centre for Medium-Range Weather Forecasts, &lt;xarray.Dataset&gt; Dimensions: (latitude: 781, longitude: 761, time: 216) Coordinates: number int32 0 * time (time) datetime64[ns] 2002-01-01 2002-02-01 ... 2019-12-01 step timedelta64[ns] 1 days surface float64 0.0 * latitude (latitude) float64 42.0 41.9 41.8 41.7 ... -35.8 -35.9 -36.0 * longitude (longitude) float64 -21.0 -20.9 -20.8 -20.7 ... 54.8 54.9 55.0 valid_time (time) datetime64[ns] ... Data variables: tp (time, latitude, longitude) float32 ... Attributes: GRIB_edition: 1 GRIB_centre: ecmf GRIB_centreDescription: European Centre for Medium-Range Weather Forecasts GRIB_subCentre: 0 Conventions: CF-1.7 institution: European Centre for Medium-Range Weather Forecasts] </code></pre> <p>So the temperature variable is called &quot;t2m&quot; and the precipitation variable &quot;tp&quot;. Precipitaion variable is split in two xarrays but I don't understand why.</p> <p>How can I obtain the needed dataset from this please ?</p> <p>It's the first time I'm dealing with such data, and I'm really lost on how to proceed.</p> https://stackoverflow.com/q/78201513 0 blaylockbk https://stackoverflow.com/users/2383070 2024-03-21T16:33:17Z 2025-07-31T13:34:42Z <p>How can I drop all xarray Dataset coordinates and variables that don't have any dimensions? 
In this example, I want to drop <code>item</code> and <code>nothing</code> from this Dataset.</p> <pre class="lang-py prettyprint-override"><code>import xarray as xr ds = xr.Dataset( {&quot;temperature&quot;: ([&quot;x&quot;, &quot;y&quot;], [[1, 2, 3], [1, 2, 3], [1, 2, 3]]), &quot;nothing&quot;: 10}, coords={&quot;item&quot;: &quot;hi&quot;, &quot;x&quot;: [1, 2, 3], &quot;y&quot;: [1, 2, 3]}, ) ds </code></pre> <p><a href="https://i.sstatic.net/mfVkR.png" rel="nofollow noreferrer"><img src="https://i.sstatic.net/mfVkR.png" alt="enter image description here" /></a></p> <p>I could of course use <code>ds.drop_vars([&quot;item&quot;, &quot;nothing&quot;])</code>, but I was expecting a different method like <code>ds.drop_dims()</code> or <code>ds.squeeze()</code> could do this without me specifying which to drop.</p> <p>An alternative way to look at this question is, how can I select only coordaintes and variables where the dimension is <code>(x,y)</code>?</p> https://stackoverflow.com/q/79708778 0 Njoku Okechukwu Valentine https://stackoverflow.com/users/4382513 2025-07-21T08:30:19Z 2025-07-22T23:11:53Z <p>I had the task of analyzing a .nc file. I have not worked with it before, so I did some research online and found some codes. I was exploring the dataset. 
I tried:</p> <p>I am using Debian 12, Python3 and Jupyter notebook</p> <pre><code># import needed libraries import numpy as np import xarray as xr # import dataset file_path = 'dataset/air_quality.nc' xrds = xr.open_dataset(file_path) xrds </code></pre> <p>I saw it printed</p> <pre> Dimensions: Coordinates: (0) Data variables: (0) Indexes: (0) Attributes: (52) </pre> <p>Down the line, I ran the following code:</p> <pre><code>xrds.data_vars </code></pre> <p>Output:</p> <pre> Data variables: *empty* </pre> <p>Does this mean the dataset has no variables or values I can explore?</p> https://stackoverflow.com/q/57924900 4 Philipe Riskalla Leal https://stackoverflow.com/users/8678015 2019-09-13T13:49:53Z 2025-07-18T14:52:19Z <p>I am applying slicing and aggregation operations over Netcdf files in Python language. One of the solutions for working with this kind of file is to use the Xarray library.</p> <p>I am still new to the library functionalities, so I would like to know whether Xarray objects possess some method to check if a sliced DataSet/DataArray is empty or not, just like Pandas has (in the case of pandas, one can check if the dataframe/series is empty through the 'empty' method).</p> <p>The only solution I found was to always convert the Xarray Dataset/DataArray into a pandas Dataframe/Series, to then check if it is empty or not.</p> <p>Here is code snippet as example:</p> <pre><code>import xarray as xr path = 'my_path_to_my_netcdf_file.nc' Xarray_DataArray = xr.open_dataset(path) print(Xarray_DataArray) # this returns something like: # Dimensions: (lat: 600, lon: 672, time: 37) # Coordinates: # * lat (lat) float32 -3.9791672 -3.9375012 ... 20.9375 20.979166 # * lon (lon) float32 -60.979168 -60.9375 ... -33.0625 -33.020832 # * time (time) datetime64[ns] 2010-05-19 2010-05-20 ... 
2010-06-24 # Data variables: # variable_name (time, lat, lon) float32 dask.array&lt;shape=(37, 600, 672), # chunksize=(37, 600, 672)&gt; # I normally use the 'sel' method to slice the xarray object, like below: Sliced_Xarray_DataArray = Xarray_DataArray.sel({'lat':slice(-10, -9), 'lon':slice(-170, -169) }) # but since, Xarray does not possess a proper way to check the slice, I usually have to do the following: if Sliced_Xarray_DataArray.to_dataframe().empty(): print('is empty. Nothing to aggregate') else: Aggregated_value = Aggregation_function(Sliced_Xarray_DataArray) print('continuing with the analysis') # ... continue </code></pre> <p>I would appreciate any suggestions.</p> <p>I thank you for your time, and I hope hearing from you soon.</p> <p>Sincerely yours,</p> <p>Philipe R. Leal</p> https://stackoverflow.com/q/79703876 0 Kieran Bartels https://stackoverflow.com/users/28437981 2025-07-16T18:36:57Z 2025-07-17T23:44:41Z <p>My overall goal is the set up a virtual dataset of <a href="https://registry.opendata.aws/nsf-ncar-era5/" rel="nofollow noreferrer">ERA5</a> data using <a href="https://icechunk.io/en/latest/" rel="nofollow noreferrer">Icechunk</a>. As a smaller test example, I'm trying to pull all the data located in the <code>194001</code> <a href="https://nsf-ncar-era5.s3.amazonaws.com/index.html#e5.oper.fc.sfc.minmax/194001/" rel="nofollow noreferrer">ERA5 folder</a>. I've been mostly able to follow the example located on the <a href="https://icechunk.io/en/latest/virtual/" rel="nofollow noreferrer">Virtual Dataset</a> page.</p> <p>My current issue is that half the ERA5 <code>.nc</code> files have <strong>30</strong> <code>forecast_initial_time</code> values and the other half have <strong>32</strong>. (I've attached an image of when I was successfully able to create the dataset using a different method so you can better understand the dataset's structure.) Below is the code I've been able to produce so far. 
All my packages and libraries are up to date and the newest versions.</p> <pre><code>// Access all s3 files in the 194001 folder fs = fsspec.filesystem('s3', anon=True) s3_files = fs.glob('nsf-ncar-era5/e5.oper.fc.sfc.minmax/194001/e5.oper.fc.sfc.minmax.*.nc') s3_files = sorted(['s3://' + f for f in s3_files]) // Create an array of virtual datasets manifest_mod.PosixPath = pathlib.Path reader_options = { &quot;storage_options&quot;: { &quot;anon&quot;: True, } } virtual_datasets = [ open_virtual_dataset( filepath=url, indexes={}, reader_options=reader_options, filetype='netCDF4', ) for url in s3_files ] // Concat all the datasets into one virtual_ds = xr.concat( virtual_datasets, dim='time', coords='minimal', compat='override', combine_attrs='override', ) </code></pre> <p>The <code>xr.concat</code> is where I get the following error:</p> <pre><code>AlignmentError: cannot reindex or align along dimension 'forecast_initial_time' because of conflicting dimension sizes: {32, 30} </code></pre> <p>I understand that the differing dimensions sizes of the <code>forecast_initial_time</code> coordinate is causing the issue, but I have no idea how to manipulate the data. Any help would be great!</p> <p><a href="https://i.sstatic.net/9WlesqKN.png" rel="nofollow noreferrer">Merged Datasets</a></p>