NEANIAS Gitlab

Commit bc0e1630 authored by Carlos H. Brandt
Browse files

Get dataset sizes script is working example

parent 9bf68beb
#!/usr/bin/env python
"""
This script calculates the data volume (total size) of PDS* datasets.
......@@ -13,28 +14,98 @@ information of size, for instance, is kept. Size values are given in Megabytes.
* PDS stands for Planetary Data System
"""
import itertools
import pandas
# Sizes are used in units of Megabytes.
# Conversion factors are keyed first by table column name, then by the
# one-letter unit suffix found at the end of a value in that column.
_factor = {
    'Size': {
        'K': 0.001,  # If value in Kilobytes, divide by K
        'M': 1,      # If value in Megabytes, do nothing
        'G': 1000,   # If value in Gigabytes, multiply by K
    }
}
def url_table2df(url, match=None):
    """
    Return a DF from first table of 'url' with files matching 'match'

    'url' is first probed with an HTTP GET and a message saying whether the
    site exists (status code 200) is printed. The page is then parsed with
    'pandas.read_html' and the first table whose text matches the regular
    expression 'match' is kept, after dropping all-empty columns and any row
    with a missing value.

    If 'match' is None, any non-empty table is accepted.

    Returns None (instead of raising) when the page cannot be read or no
    table is found.
    """
    def url_exists(url):
        # 'requests' is only needed for this reachability message
        import requests
        try:
            status_code = requests.get(url).status_code
        except requests.RequestException:
            # Unreachable host, bad URL, etc.: report it like a missing site
            status_code = None
        if status_code == 200:
            print('Web site {} exists'.format(url))
        else:
            print('Web site {} does not exist'.format(url))

    url_exists(url)

    # 'pandas.read_html' compiles 'match' as a regex, so None is not a valid
    # value there; fall back to pandas' own default (any non-empty text).
    if match is None:
        match = '.+'

    try:
        tabs = pandas.read_html(url, match=match)
        tab = tabs[0]
        # Remove columns and rows with invalid values
        tab = tab.dropna(axis=1, how='all').dropna()
    except Exception:
        # Best effort: a page that cannot be fetched/parsed yields no table.
        # 'Exception' (not a bare except) lets Ctrl-C and SystemExit through.
        tab = None
    return tab
def read_url_table(url, match=None):
    """
    Return 'url_table2df' output with "Size" column in Megabytes

    Conversion uses the module's constant '_factor': the last character of
    each size is taken as the unit (looked up case-insensitively) and the
    preceding characters as the number, e.g. '12K' -> 12 * 0.001.

    Returns None when 'url_table2df' returns None.

    NOTE(review): sizes are assumed to be strings ending with one of the
    units known to '_factor'; any other value raises KeyError/ValueError.
    """
    tab = url_table2df(url, match=match)
    if tab is None:
        return None

    # Transform file sizes to Megabytes
    col = 'Size'
    fx = _factor[col]

    def to_megabytes(size):
        # The numeric part must be converted before scaling; multiplying the
        # raw string would repeat it (or fail for a float factor).
        return float(size[:-1]) * fx[size[-1].upper()]

    tab[col] = tab[col].apply(to_megabytes)
    return tab
def build_urls(template, **kwargs):
    """
    Return list of URLs built from 'template' and 'kwargs' combination

    'template' is a string like:
    ```
    https://pds-imaging.jpl.nasa.gov/data/mro/ctx/mrox_{i:04d}/data/
    ```
    to which 'kwargs' would be something like:
    ```
    { 'i': [1, 2, 10, 1234] }
    ```
    Each keyword names a field of 'template' and gives an iterable of values
    for it. One URL is built for every combination of the given values
    (cartesian product, in 'itertools.product' order). Without keywords,
    'template' is formatted once with no fields.
    """
    def product_dict(**kwargs):
        """
        Yield one dict per combination of the values in 'kwargs'

        From https://stackoverflow.com/a/5228294/687896
        """
        keys = kwargs.keys()
        for instance in itertools.product(*kwargs.values()):
            yield dict(zip(keys, instance))

    # Consume the generator directly; no intermediate list is needed
    return [template.format(**fields) for fields in product_dict(**kwargs)]
def read_urls(urls, match=None):
    """
    Return the list of 'read_url_table' results, one per entry of 'urls'

    Results keep the order of 'urls'. An entry is None when
    'read_url_table' returned None for that URL, so positions stay aligned
    with the input.
    """
    return [read_url_table(url, match=match) for url in urls]
def run():
    """
    Example run: read and print the 'IMG' tables of ten 'mro/ctx' volumes

    URLs are built for 'i' from 0 to 9 (mrox_0000 ... mrox_0009).
    """
    url_pattern = 'https://pds-imaging.jpl.nasa.gov/data/mro/ctx/mrox_{i:04d}/data/'
    file_pattern = 'IMG'
    volume_urls = build_urls(url_pattern, i=range(10))
    print(read_urls(volume_urls, file_pattern))
# Execute the example only when this file is run as a script, not on import
if __name__ == '__main__':
    run()
Supports Markdown
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment