NEANIAS Gitlab
Skip to content
GitLab
Menu
Projects
Groups
Snippets
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Sign in
Toggle navigation
Menu
Open sidebar
Carlos H. Brandt
DM
Commits
bc0e1630
Commit
bc0e1630
authored
Apr 23, 2020
by
Carlos H. Brandt
Browse files
Get dataset sizes script is working example
parent
9bf68beb
Changes
1
Hide whitespace changes
Inline
Side-by-side
datasets/code/get_datasets_sizes.py
100644 → 100755
View file @
bc0e1630
#!/usr/bin/env python
"""
This script calculates the data volume (total size) of PDS* datasets.
...
...
@@ -13,28 +14,98 @@ information of size, for instance, is kept. Size values are given in Megabytes.
* PDS stands for Planetary Data Systems
"""
import
itertools
import
pandas
# Sizes are used in units of Megabytes
_factor
=
{
'Size'
:
{
'K'
:
0.001
,
# If value in Kilobytes, divide by K
'M'
:
1
,
# If value in Megabytes, do nothing
'G'
:
1000
# If value in Gigabytes, multiply by K
}
}
}
def url_table2df(url, match=None):
    """
    Return a DF from first table of 'url' with files matching 'match'

    The URL is first probed with an HTTP GET and the outcome is printed.
    The first table pandas finds in the page is then cleaned by dropping
    the columns that are entirely empty and, after that, the rows with any
    missing value.

    Input:
     - url   : address of the page holding the table
     - match : text or regular expression the table must contain;
               if None, pandas' default pattern is used

    Output:
     - pandas.DataFrame, or None if no table could be read from 'url'
    """
    import pandas as pd

    def url_exists(url):
        """Print whether a GET request to 'url' answers with status 200"""
        import requests
        request = requests.get(url)
        if request.status_code == 200:
            print('Web site {} exists'.format(url))
        else:
            print('Web site {} does not exist'.format(url))

    url_exists(url)

    # 'read_html' compiles 'match' as a regular expression, so None must not
    # be forwarded: leave the argument out and let pandas use its default.
    read_args = {} if match is None else {'match': match}

    try:
        tabs = pd.read_html(url, **read_args)
    except Exception as err:
        # Best effort: a page without a matching table (or any read/parse
        # failure) yields None, but Ctrl-C and SystemExit still propagate.
        print('Could not read table from {}: {}'.format(url, err))
        return None

    # Remove columns and rows with invalid values
    return tabs[0].dropna(axis=1, how='all').dropna()
def read_url_table(url, match=None):
    """
    Return 'url_table2df' output with "Size" column in Megabytes

    Conversion uses the module's constant '_factor': each value of the
    'Size' column is expected to be a string made of a number followed by a
    one-letter unit (the letter is looked up upper-cased in '_factor').

    Input:
     - url   : address of the page holding the table
     - match : forwarded to 'url_table2df'

    Output:
     - the table with 'Size' as float Megabytes, or None if 'url_table2df'
       returned None

    Raises KeyError if a unit letter is not listed in '_factor' and
    ValueError if the part before the unit is not a number.
    """
    tab = url_table2df(url, match=match)
    if tab is None:
        return None

    # Transform file sizes to Megabytes
    col = 'Size'
    fx = _factor[col]

    def to_megabytes(size):
        # The numeric part must become a float before scaling; multiplying
        # the raw string would repeat it (int factor) or fail (float factor).
        return float(size[:-1]) * fx[size[-1].upper()]

    tab[col] = tab[col].apply(to_megabytes)
    return tab
def build_urls(template, **kwargs):
    """
    Return list of URLs built from 'template' and 'kwargs' combination

    'template' is a string like:
    ```
    https://pds-imaging.jpl.nasa.gov/data/mro/ctx/mrox_{i:04d}/data/
    ```
    to which 'kwargs' would be something like:
    ```
    { 'i': [1, 2, 10, 1234] }
    ```

    Every keyword holds an iterable of values; one URL is produced for each
    element of the cartesian product of those iterables, with the last
    keyword varying fastest. An empty iterable yields an empty list.
    """
    # Cartesian product of keyword values, after
    # https://stackoverflow.com/a/5228294/687896
    keys = list(kwargs)
    return [
        template.format(**dict(zip(keys, combination)))
        for combination in itertools.product(*kwargs.values())
    ]
def read_urls(urls, match=None):
    """
    Return the list of 'read_url_table' outputs, one for each item of 'urls'

    Results keep the order of 'urls'; each entry is whatever
    'read_url_table' returned for that address, called with the same 'match'.
    """
    return [read_url_table(address, match=match) for address in urls]
def run():
    """
    Example run: print the tables read from the pages generated by filling
    the URL template below with i = 0..9, keeping tables that match 'IMG'
    """
    ctx_urls = build_urls(
        'https://pds-imaging.jpl.nasa.gov/data/mro/ctx/mrox_{i:04d}/data/',
        i=range(10),
    )
    print(read_urls(ctx_urls, 'IMG'))
# Run the example only when this file is executed as a script
if __name__ == '__main__':
    run()
Write
Preview
Supports
Markdown
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment