NEANIAS Gitlab

Commit f8d92284 authored by Carlos H. Brandt's avatar Carlos H. Brandt
Browse files

Upload notebook/env example for 'get_datasets_sizes'

parent 0bba3fb3
{
"cells": [
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# Compute PDS data volumes"
]
},
{
"cell_type": "code",
"execution_count": 1,
"metadata": {},
"outputs": [],
"source": [
"import datasets_parse_config as dc\n",
"\n",
"archives = dc.archives\n",
"\n",
"archives"
]
},
{
"cell_type": "code",
"execution_count": 2,
"metadata": {},
"outputs": [],
"source": [
"import get_datasets_sizes as get_sizes\n",
"\n",
"# If you want to run over all of them, uncomment the following\n",
"# (this is just a copy-n-paste from `get_datasets_sizes.__main__` block)\n",
"\n",
"# for dset in archives.keys():\n",
"# pars = archives[dset]\n",
"#\n",
"# print('Running {}'.format(dset))\n",
"# tabs = get_sizes.run(**pars)\n",
"#\n",
"# total = 0\n",
"# for tab in tabs:\n",
"# if tab is None:\n",
"# continue\n",
"# size = tab['Size'].sum()\n",
"# total += size\n",
"# print('Partial sizes in {!s}: {:d} MB'.format(dset,size))\n",
"# print('Total size of {!s}: {:.2f} GB'.format(dset,float(total/1000)))\n"
]
},
{
"cell_type": "code",
"execution_count": 3,
"metadata": {},
"outputs": [],
"source": [
"# Let's test with our/my beloved CTX\n",
"\n",
"# CTX\n",
"ctx_pars = archives['ctx']"
]
},
{
"cell_type": "code",
"execution_count": 4,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"cannot parse from 'NoneType'\n",
"URL https://pds-imaging.jpl.nasa.gov/data/mro/ctx/mrox_0000/data/ has no table\n",
"Writing tab from https://pds-imaging.jpl.nasa.gov/data/mro/ctx/mrox_0001/data/\n",
"Writing tab from https://pds-imaging.jpl.nasa.gov/data/mro/ctx/mrox_0002/data/\n",
"Writing tab from https://pds-imaging.jpl.nasa.gov/data/mro/ctx/mrox_0003/data/\n",
"Writing tab from https://pds-imaging.jpl.nasa.gov/data/mro/ctx/mrox_0004/data/\n"
]
}
],
"source": [
"tabs = get_sizes.run(**ctx_pars)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
 Clearly the">
"> The output/user feedback is clearly poor and needs improvement. Apparently something was written to disk (I actually know that, but... anyhow, let me check what is there)."
]
},
{
"cell_type": "code",
"execution_count": 5,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"dset_tabs/data/mro/ctx/mrox_0001/data:\r\n",
"indexdf.csv\r\n",
"\r\n",
"dset_tabs/data/mro/ctx/mrox_0002/data:\r\n",
"indexdf.csv\r\n",
"\r\n",
"dset_tabs/data/mro/ctx/mrox_0003/data:\r\n",
"indexdf.csv\r\n",
"\r\n",
"dset_tabs/data/mro/ctx/mrox_0004/data:\r\n",
"indexdf.csv\r\n"
]
}
],
"source": [
"ls dset_tabs/data/mro/ctx/*/data"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"**The reason why we have only 4 sets of CTX is because of** (tests):"
]
},
{
"cell_type": "code",
"execution_count": 6,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"{'i': range(0, 5)}"
]
},
"execution_count": 6,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"ctx_pars['kwargs']"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"So...we have to walk through these files and read/reduce them. Will define a function to walk through and concatenate to a global DF"
]
},
{
"cell_type": "code",
"execution_count": 7,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
",Name,Last modified,Size\r\n",
"1,CRU_000001_9999_XN_99N999W.IMG,2007-05-17 11:23,4\r\n",
"2,CRU_000002_9999_XN_99N999W.IMG,2007-05-17 11:24,74\r\n",
"3,CRU_000003_9999_XN_99N999W.IMG,2007-05-17 11:24,9\r\n",
"4,CRU_000004_9999_XN_99N999W.IMG,2007-05-17 11:25,9\r\n",
"5,CRU_000005_9999_XN_99N999W.IMG,2007-05-17 11:26,104\r\n",
"6,CRU_000006_9999_XN_99N999W.IMG,2007-05-17 11:26,74\r\n",
"7,CRU_000007_9999_XN_99N999W.IMG,2007-05-17 11:27,9\r\n",
"8,CRU_000008_9999_XN_99N999W.IMG,2007-05-17 11:27,9\r\n",
"9,CRU_000009_9999_XN_99N999W.IMG,2007-05-17 11:27,104\r\n",
"10,CRU_000010_9999_XN_99N999W.IMG,2007-05-17 11:28,74\r\n",
"11,CRU_000011_9999_XN_99N999W.IMG,2007-05-17 11:28,9\r\n",
"12,CRU_000012_9999_XN_99N999W.IMG,2007-05-17 11:28,49\r\n",
"13,CRU_000013_9999_XN_99N999W.IMG,2007-05-17 11:29,95\r\n",
"14,CRU_000014_9999_XN_99N999W.IMG,2007-05-17 11:29,69\r\n",
"15,CRU_000015_9999_XN_99N999W.IMG,2007-05-17 11:30,138\r\n"
]
}
],
"source": [
"cat dset_tabs/data/mro/ctx/mrox_0001/data/indexdf.csv"
]
},
{
"cell_type": "code",
"execution_count": 8,
"metadata": {},
"outputs": [],
"source": [
"%%sh\n",
"\n",
"ALL_INDXS=\"dset_tabs/data/mro/ctx/indexdf_all.csv\"\n",
"rm $ALL_INDXS\n",
"\n",
"find dset_tabs/data -name indexdf.csv -exec bash -c \"tail -n +2 {} >> $ALL_INDXS\" \\;"
]
},
{
"cell_type": "code",
"execution_count": 9,
"metadata": {},
"outputs": [],
"source": [
"# %%sh\n",
"\n",
"# cat dset_tabs/data/mro/ctx/indexdf_all.csv"
]
},
{
"cell_type": "code",
"execution_count": 10,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Volume of this CTX sample: 8416 MB\n"
]
}
],
"source": [
"import pandas as pd\n",
"\n",
"df = pd.read_csv('dset_tabs/data/mro/ctx/indexdf_all.csv', names='Index,Name,Last modified,Size'.split(',')\n",
" ,index_col=0).set_index('Name', drop=True)\n",
"\n",
"print(f\"Volume of this CTX sample: {df['Size'].sum()} MB\")"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": []
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.7.8"
}
},
"nbformat": 4,
"nbformat_minor": 4
}
name: jupyter
channels:
- conda-forge
- defaults
dependencies:
- alembic=1.4.3
- appnope=0.1.0
- argon2-cffi=20.1.0
- async_generator=1.10
- attrs=20.2.0
- backports=1.0
- backports.functools_lru_cache=1.6.1
- bleach=3.1.5
- blinker=1.4
- branca=0.3.1
- brotlipy=0.7.0
- c-ares=1.16.1
- ca-certificates=2020.6.20
- certifi=2020.6.20
- certipy=0.1.3
- cffi=1.14.1
- chardet=3.0.4
- configurable-http-proxy=4.2.1
- cryptography=3.1
- cycler=0.10.0
- decorator=4.4.2
- defusedxml=0.6.0
- entrypoints=0.3
- freetype=2.10.3
- icu=64.2
- idna=2.10
- importlib-metadata=1.7.0
- importlib_metadata=1.7.0
- ipykernel=5.3.4
- ipyleaflet=0.13.3
- ipython=5.8.0
- ipython_genutils=0.2.0
- ipywidgets=7.5.1
- jinja2=2.11.2
- jpeg=9d
- json5=0.9.4
- jsonschema=3.2.0
- jupyter_client=6.1.7
- jupyter_core=4.6.3
- jupyter_telemetry=0.0.5
- jupyterhub=1.1.0
- jupyterhub-base=1.1.0
- jupyterlab=2.2.7
- jupyterlab_pygments=0.1.1
- jupyterlab_server=1.2.0
- kiwisolver=1.2.0
- krb5=1.17.1
- lcms2=2.11
- libblas=3.9.0
- libcblas=3.9.0
- libcurl=7.71.1
- libcxx=10.0.1
- libedit=3.1.20191231
- libev=4.33
- libffi=3.2.1
- libgfortran=5.0.0
- libgfortran5=9.3.0
- liblapack=3.9.0
- libnghttp2=1.41.0
- libpng=1.6.37
- libsodium=1.0.18
- libssh2=1.9.0
- libtiff=4.1.0
- libuv=1.39.0
- libwebp-base=1.1.0
- llvm-openmp=11.0.0
- lz4-c=1.9.2
- mako=1.1.3
- markupsafe=1.1.1
- matplotlib=3.3.2
- matplotlib-base=3.3.2
- mistune=0.8.4
- nbclient=0.5.0
- nbconvert=6.0.1
- nbformat=5.0.7
- ncurses=6.2
- nest-asyncio=1.4.0
- nodejs=12.18.3
- notebook=6.1.4
- numpy=1.19.2
- oauthlib=3.0.1
- olefile=0.46
- openssl=1.1.1h
- packaging=20.4
- pamela=1.0.0
- pandas=1.1.3
- pandoc=2.10.1
- pandocfilters=1.4.2
- pexpect=4.8.0
- pickleshare=0.7.5
- pillow=8.0.0
- pip=20.2.3
- prometheus_client=0.8.0
- prompt_toolkit=1.0.15
- ptyprocess=0.6.0
- pycparser=2.20
- pycurl=7.43.0.5
- pygments=2.6.1
- pyjwt=1.7.1
- pyopenssl=19.1.0
- pyparsing=2.4.7
- pyrsistent=0.17.2
- pysocks=1.7.1
- python=3.7.8
- python-dateutil=2.8.1
- python-editor=1.0.4
- python-json-logger=0.1.11
- python_abi=3.7
- pytz=2020.1
- pyzmq=19.0.2
- readline=8.0
- requests=2.24.0
- ruamel.yaml=0.16.12
- ruamel.yaml.clib=0.2.2
- send2trash=1.5.0
- setuptools=49.6.0
- simplegeneric=0.8.1
- six=1.15.0
- sqlalchemy=1.3.19
- sqlite=3.33.0
- terminado=0.8.3
- testpath=0.4.4
- tk=8.6.10
- tornado=6.0.4
- traitlets=4.3.3
- traittypes=0.2.1
- urllib3=1.25.10
- wcwidth=0.2.5
- webencodings=0.5.1
- wheel=0.35.1
- widgetsnbextension=3.5.1
- xz=5.2.5
- zeromq=4.3.2
- zipp=3.1.0
- zlib=1.2.11
- zstd=1.4.5
# Packages installed with pip (nested under the `pip` key so conda hands them to pip).
- pip:
  - lxml==4.6.2
prefix: /opt/miniconda3/envs/jupyter
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment