Accessing remote files with earthaccess
When we search for data using earthaccess, we get back a list of results from NASA's Common Metadata Repository (CMR). These results contain all the information
we need to access the files represented by the metadata. earthaccess offers two access methods that operate on these results. The first is the well-known download(),
which copies the files from their remote location to our local disk; if we are running the code in AWS, say on a JupyterHub, the files will be copied to the local VM disk.
The other method is open(): earthaccess uses fsspec to open remote files as if they were local. open() has advantages and some disadvantages that we must know before using it.
The main advantage of open() is that we don't have to download the file; we can stream it into memory. However, depending on how we do it, we may run into network performance issues. Again, if we run the code next to the data this will be fast; if we do it locally on our laptops it will be slow.
import earthaccess
auth = earthaccess.login()
--------------------------------------------------------------------------- ModuleNotFoundError Traceback (most recent call last) Cell In[1], line 1 ----> 1 import earthaccess 3 auth = earthaccess.login() File ~/checkouts/readthedocs.org/user_builds/earthaccess/envs/1135/lib/python3.11/site-packages/earthaccess/__init__.py:26 24 from .auth import Auth 25 from .dmrpp_zarr import open_virtual_dataset, open_virtual_mfdataset ---> 26 from .icechunk import open_icechunk_from_url 27 from .kerchunk import consolidate_metadata 28 from .search import DataCollection, DataCollections, DataGranule, DataGranules File ~/checkouts/readthedocs.org/user_builds/earthaccess/envs/1135/lib/python3.11/site-packages/earthaccess/icechunk.py:6 3 from urllib.parse import urlparse 5 import earthaccess ----> 6 import icechunk as ic 7 from icechunk import IcechunkStore, S3StaticCredentials, s3_storage 9 ######################## bunch of hardcoded things to revise later ################### 10 # As discussed in https://github.com/nsidc/earthaccess/pull/1135 this should be stored 11 # independent from the code. Will be implemented in a separate PR that needs to be 12 # merged before this one. ModuleNotFoundError: No module named 'icechunk'
results = earthaccess.search_data(
short_name="ATL06",
cloud_hosted=False,
temporal=("2019-01", "2019-02"),
polygon=[(-100, 40), (-110, 40), (-105, 38), (-100, 40)],
)
results[0]
--------------------------------------------------------------------------- NameError Traceback (most recent call last) Cell In[2], line 1 ----> 1 results = earthaccess.search_data( 2 short_name="ATL06", 3 cloud_hosted=False, 4 temporal=("2019-01", "2019-02"), 5 polygon=[(-100, 40), (-110, 40), (-105, 38), (-100, 40)], 6 ) 7 results[0] NameError: name 'earthaccess' is not defined
nsidc_url = "https://n5eil01u.ecs.nsidc.org/DP7/ATLAS/ATL06.005/2019.02.21/ATL06_20190221121851_08410203_005_01.h5"
lpcloud_url = "https://data.lpdaac.earthdatacloud.nasa.gov/lp-prod-protected/EMITL2ARFL.001/EMIT_L2A_RFL_001_20220903T163129_2224611_012/EMIT_L2A_RFL_001_20220903T163129_2224611_012.nc"
session = earthaccess.get_requests_https_session()
--------------------------------------------------------------------------- NameError Traceback (most recent call last) Cell In[3], line 4 1 nsidc_url = "https://n5eil01u.ecs.nsidc.org/DP7/ATLAS/ATL06.005/2019.02.21/ATL06_20190221121851_08410203_005_01.h5" 2 lpcloud_url = "https://data.lpdaac.earthdatacloud.nasa.gov/lp-prod-protected/EMITL2ARFL.001/EMIT_L2A_RFL_001_20220903T163129_2224611_012/EMIT_L2A_RFL_001_20220903T163129_2224611_012.nc" ----> 4 session = earthaccess.get_requests_https_session() NameError: name 'earthaccess' is not defined
headers = {"Range": "bytes=0-100"}
r = session.get(lpcloud_url, headers=headers)
r
--------------------------------------------------------------------------- NameError Traceback (most recent call last) Cell In[4], line 2 1 headers = {"Range": "bytes=0-100"} ----> 2 r = session.get(lpcloud_url, headers=headers) 3 r NameError: name 'session' is not defined
fs = earthaccess.get_fsspec_https_session()
--------------------------------------------------------------------------- NameError Traceback (most recent call last) Cell In[5], line 1 ----> 1 fs = earthaccess.get_fsspec_https_session() NameError: name 'earthaccess' is not defined
with fs.open(lpcloud_url) as f:
data = f.read(100)
data
--------------------------------------------------------------------------- NameError Traceback (most recent call last) Cell In[6], line 1 ----> 1 with fs.open(lpcloud_url) as f: 2 data = f.read(100) 3 data NameError: name 'fs' is not defined
%%time
import xarray as xr
files = earthaccess.open(results[0:2])
ds = xr.open_dataset(files[0], group="/gt1r/land_ice_segments")
ds
CPU times: user 447 ms, sys: 55.8 ms, total: 503 ms Wall time: 403 ms
--------------------------------------------------------------------------- NameError Traceback (most recent call last) Cell In[7], line 1 ----> 1 get_ipython().run_cell_magic('time', '', '\nimport xarray as xr\n\nfiles = earthaccess.open(results[0:2])\n\nds = xr.open_dataset(files[0], group="/gt1r/land_ice_segments")\nds\n') File ~/checkouts/readthedocs.org/user_builds/earthaccess/envs/1135/lib/python3.11/site-packages/IPython/core/interactiveshell.py:2565, in InteractiveShell.run_cell_magic(self, magic_name, line, cell) 2563 with self.builtin_trap: 2564 args = (magic_arg_s, cell) -> 2565 result = fn(*args, **kwargs) 2567 # The code below prevents the output from being displayed 2568 # when using magics with decorator @output_can_be_silenced 2569 # when the last Python token in the expression is a ';'. 2570 if getattr(fn, magic.MAGIC_OUTPUT_CAN_BE_SILENCED, False): File ~/checkouts/readthedocs.org/user_builds/earthaccess/envs/1135/lib/python3.11/site-packages/IPython/core/magics/execution.py:1452, in ExecutionMagics.time(self, line, cell, local_ns) 1450 if interrupt_occured: 1451 if exit_on_interrupt and captured_exception: -> 1452 raise captured_exception 1453 return 1454 return out File ~/checkouts/readthedocs.org/user_builds/earthaccess/envs/1135/lib/python3.11/site-packages/IPython/core/magics/execution.py:1416, in ExecutionMagics.time(self, line, cell, local_ns) 1414 st = clock2() 1415 try: -> 1416 exec(code, glob, local_ns) 1417 out = None 1418 # multi-line %%time case File <timed exec>:3 NameError: name 'earthaccess' is not defined