Commit ff19804c authored by Carlos H. Brandt

Split html parsing from http request

parent 6d67385b
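The commit splits each helper into an HTTP step and a parsing step: get_index() performs the request, while the new html_table2df() and html_regex2df() operate on an already-fetched HTML string. A minimal sketch of the resulting composition, using the function names from this diff (the index URL is illustrative):

# Sketch: fetch once, then parse offline (hypothetical URL; names from this diff)
url = 'https://hirise-pds.lpl.arizona.edu/PDS/RDR/'
html = get_index(url)                       # network I/O only; None on non-200
if html is not None:
    tab = html_table2df(html, match='img')  # pure parsing, testable without network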
@@ -44,12 +44,13 @@ archives = {
'match': 'img'
},
# 'hirise': {
# 'template': 'https://hirise-pds.lpl.arizona.edu/PDS/RDR/{dl}/{orb}/{dlorb}',
# 'kwargs': {
# 'dl': ['ESP','PSP'],
# 'orb': list_dirs('*'),
# 'dlorb': list_dirs('*')
# }
# }
'hirise': {
'template': 'https://hirise-pds.lpl.arizona.edu/PDS/RDR/{dl}/{orb}/{dlorb}',
'kwargs': {
'dl': ['ESP','PSP'],
'orb': '*',
'dlorb': '*'
}
#'<a href=.*>(?P<name>{match}?)'+ '\s*'+ '[0-9].*[ap]m' + '\s*(?P<size>[0-9].*?)\s*'
}
}
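The 'hirise' entry above is consumed by build_urls() in the next hunk. Its body is not part of this diff, but given the template/kwargs shape it presumably substitutes every combination of the keyword values into the placeholders; a hedged illustration of that assumed behavior:

# Assumed behavior of build_urls() (its body is outside this diff):
urls = build_urls('https://hirise-pds.lpl.arizona.edu/PDS/RDR/{dl}/{orb}/{dlorb}',
                  dl=['ESP', 'PSP'], orb=['*'], dlorb=['*'])
# expected: ['https://hirise-pds.lpl.arizona.edu/PDS/RDR/ESP/*/*',
#            'https://hirise-pds.lpl.arizona.edu/PDS/RDR/PSP/*/*']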
@@ -59,83 +59,94 @@ def build_urls(template, **kwargs):
# - remove repeated '/' in case a kwarg is an empty string list ([""])
return urls
def http_table2df(url, match=None, parser=None):
def html_table2df(html, match=None, parser=None):
"""
Return a DF from first table of 'url' with files matching 'match'.
'parser' is not actually used yet; it may provide extra pandas options.
"""
def url_exists(url):
request = requests.get(url)
if request.status_code != 200:
print("Web site '{}' not accessible".format(url))
else:
print("Web site '{}' OK".format(url))
return request.status_code
if not url_exists(url):
return None
try:
tabs = pandas.read_html(url, match=match, attrs=parser)
tabs = pandas.read_html(html, match=match, attrs=parser)
tab = tabs[0]
# Remove columns and rows with invalid values
tab = tab.dropna(axis=1, how='all').dropna()
except Exception as e:
print(e)
tab = None
return tab
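Since html_table2df() forwards 'parser' to pandas.read_html() as its attrs argument, it can pin down a specific table by HTML attribute; a small hedged example (the class name is hypothetical):

# attrs-style table selection; 'indexlist' is a made-up class name
tab = html_table2df(html, match='img', parser={'class': 'indexlist'})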
# alias for deprecation
url_table2df = http_table2df
def http_regex2df(url, match, parser):
"""
Parse a '<pre/>' defined table in an HTML document (from 'url').
'match' is used to filter the lines. In '<pre/>' pages, line breaking
<br> is used to break the document/string into lines.
'parser' is expected to provide a regex with 'name' and 'size' named
groups in it: `(?P<name>...)`, `(?P<size>...)`, ...
You can use 'match' inside 'parser' with a (?P<match>...) placeholder
"""
def html_regex2df(html, match, parser):
def crop_html_pre(html):
pattern = '<pre>(.*?)</pre>'
return re.search(pattern, html).group(1)
def filter_matching(pretext, pattern):
entries = pretext.split('<br>')
return [v for v in entries if re.search(pattern, v.lower())]
if pattern:
return [v for v in entries if re.search(pattern, v.lower())]
return entries
def parse_listing(entries, pattern):
out = []
for row in entries:
match = re.search(pattern, row.lower().strip())
filename = match.group('name')
filesize = match.group('size')
out.append((filename,filesize))
try:
match = re.search(pattern, row.lower().strip())
filename = match.group('name')
filesize = match.group('size')
out.append((filename,filesize))
except Exception as e:
pass
return out
html = requests.get(url).text
try:
pre = crop_html_pre(html)
except:
except Exception as e:
print(e)
return None
print(pre)
fls = filter_matching(pre, match)
print('FILES',fls)
try:
files_size = parse_listing(fls, parser.format(match=match))
except:
pattern = parser % {match:match} #.format(match=match)
print(pattern)
files_size = parse_listing(fls, pattern)
except Exception as e:
print(e)
return None
tab = pandas.DataFrame(files_size,columns=('Name','Size'))
return pandas.DataFrame(files_size,columns=('Name','Size'))
return tab
url_regex2df = http_regex2df
def get_index(url):
request = requests.get(url)
if request.status_code != 200:
return None
return request.text
# def http_table2df(url, match=None, parser=None):
# """
# Return a DF from first table of 'url' with files matching 'match'.
# """
# html = get_index(url)
# if html is None:
# return None
#
# return html_table2df(html, match, parser)
#
# def http_regex2df(url, match, parser):
# """
# Parse a '<pre/>' defined table in an HTML document (from 'url').
# 'match' is used to filter the lines. In '<pre/>' pages, line breaking
# <br> is used to break the document/string into lines.
# 'parser' is expected to provide a regex with 'name' and 'size' named
# groups in it: `(?P<name>...)`, `(?P<size>...)`, ...
#
# You can use 'match' inside 'parser' with a (?P<match>...) placeholder
# """
# html = get_index(url)
# if html is None:
# return None
#
# return html_regex2df(html, match, parser)
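For reference, a 'parser' value of the kind html_regex2df() expects, modeled on the regex commented out in the first hunk; the exact listing layout varies, so this is only a sketch. The '{match}' placeholder is filled by parser.format(match=match) before matching:

# Hypothetical listing regex; the named groups and '{match}' placeholder are required
parser = r'<a href=.*?>(?P<name>.*?{match})</a>\s*[0-9].*[ap]m\s*(?P<size>[0-9].*?)\s*$'
tab = html_regex2df(html, match='img', parser=parser)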
def read_http_table(url, match=None, parser=None):
"""
@@ -143,10 +154,11 @@ def read_http_table(url, match=None, parser=None):
Conversion uses the module's constant '_factor'
"""
html = get_index(url)
if not parser or 'pandas' in parser:
tab = url_table2df(url, match=match, parser=parser['pandas'])
tab = html_table2df(html, match=match, parser=parser['pandas'])
else:
tab = url_regex2df(url, match, parser=parser['re'])
tab = html_regex2df(html, match, parser=parser['re'])
if tab is None:
return None
@@ -163,9 +175,6 @@ def read_http_table(url, match=None, parser=None):
print(e)
return tab
# alias for deprecation
read_url_table = read_http_table
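Callers are untouched by the split: read_http_table() keeps its URL-based signature and now fetches once via get_index() before dispatching to a parsing branch. An illustrative call (note that the branch above expects a 'pandas' key whenever a parser dict is passed):

# Pandas-based branch; the parser dict keys ('pandas'/'re') come from the code above
tab = read_http_table(url, match='img', parser={'pandas': None})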
def read_ftp_table(url, match=None):
from ftplib import FTP
......