diff --git a/common/python/nomadcore/nomad_query.py b/common/python/nomadcore/nomad_query.py index 956af3d06e5426fed2a6a2ccd0694cb8586cecea..64cde748d5278b1a249ef87507576db8fd93d33b 100644 --- a/common/python/nomadcore/nomad_query.py +++ b/common/python/nomadcore/nomad_query.py @@ -6,7 +6,7 @@ Benjamin Regler - Apache 2.0 License @license http://www.apache.org/licenses/LICENSE-2.0 @author Benjamin Regler -@version 1.0.0 +@version 2.0.0 Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. @@ -31,13 +31,13 @@ import random if sys.version_info.major > 2: # For Python 3.0 and later - from urllib.parse import quote, unquote_plus + from urllib.parse import quote, unquote_plus, urlencode from urllib.request import urlopen, Request else: # Fall back to Python 2's urllib2 - from urllib import quote, unquote_plus + from urllib import quote, unquote_plus, urlencode from urllib2 import urlopen, Request - + class NomadQueryResult(object): """Nomad Query Result class @@ -47,48 +47,81 @@ class NomadQueryResult(object): def __init__(self, query, response, version=1.0): """Constructor. - + Arguments: query {dict} -- Query information, i.e., query filter, context, group_by, endpoint, and URL response {dict} -- Response of the Nomad Query API - + Keyword Arguments: version {number} -- Version of the Nomad Query data file (default: {1.0}) """ self._uri = [] + self._download_url = '' self._query = query or {} self._timestamp = int(time.time()) - self._response = response.get('result', {}) + + # Load response information + self._load(response, version) + + def _load(self, response, version): + """Load response information + + Arguments: + response {dict} -- Response of the Nomad Query API + version {float} -- Version of the Nomad Query data file + """ + # Set version of the Nomad Query data file self._version = version - # Construct download path - path = response.get('path', '') - self._download_url = self._query.get('endpoint', '') + 'download/' + \ - path.split('_')[-1] + '?file=' + quote(path.encode('utf-8')) + '.json' + # Initialize + if version == 1.0: + self._response = response.get('result', {}) - # Get Nomad URIs - response = NomadQuery().request(self._download_url) - if response['status'] == 'success': - regex = re.compile(r'(?<=/[a-zA-Z0-9\-_]{3}/)[^\.]+') - paths = response['data'].get('result', []) + # Construct download path + path = response.get('path', '') + self._download_url = self._query.get('endpoint', '') + \ + 'download/' + path.split('_')[-1] + '?file=' + \ + quote(path.encode('utf-8')) + '.json' - for path in paths: - match = regex.search(path) - if match: - # Substitute prefixes - groups = match.group(0).split('/') - groups[0] = 'N' + groups[0][1:] # Normalized + # Get Nomad URIs + response = NomadQuery.request(self._download_url) + if response['status'] == 'success': + regex = re.compile(r'(?<=/[a-zA-Z0-9\-_]{3}/)[^.]+') + paths = response['data'].get('result', []) - if len(groups) == 2: - groups[1] = 'C' + groups[1][1:] # Computed + for path in paths: + match = regex.search(path) + if match: + # Substitute prefixes + groups = match.group(0).split('/') + groups[0] = 'N' + groups[0][1:] # Normalized - self._uri.append('nmd://' + '/'.join(groups)) + if len(groups) == 2: + groups[1] = 'C' + groups[1][1:] # Computed + + self._uri.append('nmd://' + '/'.join(groups)) + + elif version == 2.0: + self._response = response.get('data', {}) + + # Construct and get Nomad URIs + for entry in self._response: + if not entry['type'].lower().endswith('calculation'): + continue + + # Get archive gid + context = entry['attributes']['metadata']['archive_context'] + gid = context['archive_gid'][0] + + # Assemble Nomad Uri + uri = 'nmd://N' + gid[1:] + '/' + entry['id'] + self._uri.append(uri) def version(self): """Get the version of the Nomad Query data file. - + Returns: float -- Version of the Nomad Query data file """ @@ -107,6 +140,10 @@ class NomadQueryResult(object): Returns: str -- The download URL of the query + + Deprecated: + Since version 2.0.0, this method is no longer used by internal code + and not recommended. """ return self._download_url @@ -142,23 +179,25 @@ class NomadQuery(object): """ # Version of the Nomad Query API - __version__ = 1.0 - - # Nomad API endpoint - endpoint = os.environ.get('NOMAD_BASE_URI','https://analytics-toolkit.nomad-coe.eu') + '/api/' + __version__ = 2.0 # Private user path user_path = '/data/private' - def __init__(self, username='', endpoint=''): + # Nomad API endpoints + endpoint = 'https://analytics-toolkit.nomad-coe.eu/api/' + query_endpoint = 'https://analytics-toolkit.nomad-coe.eu/archive/nql-api/' + + def __init__(self, username='', endpoint='', query_endpoint=''): """Constructor. Keyword Arguments: - username {str} -- Current username. Leave empty to auto-detect - username (default: {''}) - endpoint {str} -- Endpoint of the Nomad API (default: - ${NOMAD_BASE_URI}/api if set, otherwise - {'https://analytics-toolkit.nomad-coe.eu/api/'}) + username {str} -- Current username. Leave empty to auto-detect + username (default: {''}) + endpoint {str} -- Endpoint of the Nomad API (default: + {'https://analytics-toolkit.nomad-coe.eu/api/'}) + query_endpoint {str} -- Endpoint of the Nomad Query API (default: + {'https://analytics-toolkit.nomad-coe.eu/nql-api/'}) """ self._username = '' self._base_path = '' @@ -170,11 +209,14 @@ class NomadQuery(object): if len(paths) == 1 and paths[0].lower() != 'nomad': username = paths[0] - # Set username and overwrite endpoint, if required + # Set username and overwrite endpoints, if required self.username(username) if endpoint: self.endpoint = str(endpoint) + if query_endpoint: + self.query_endpoint = str(query_endpoint) + def username(self, username=''): """Get or set the username. @@ -190,62 +232,6 @@ class NomadQuery(object): 'nomad-query') return self._username - def request(self, url, timeout=10): - """Request a URL - - Arguments: - url {str} -- The URL of a web address - - Keyword Arguments: - timeout {number} -- Timeout of the request in seconds (default: {10}) - - Returns: - dict -- A dictionary with success status, response data, or - error message - """ - # Default request response - result = { - 'url': url, - 'status': 'error', - 'message': 'Unknown error. Please inform the Nomad team to ' - 'solve this problem.' - } - - try: - # Get URL - response = urlopen(Request(url), timeout=timeout) - if response.code != 200: - raise RuntimeError(result['message']) - - # Read response - data = json.loads(response.read().decode('utf-8'), 'utf-8') - - # Populate result - result.pop('message') - result.update({ - 'status': 'success', - 'data': data - }) - except Exception as exc: - exc = sys.exc_info()[1] - response = result.copy() - - # Get error message - message = exc - if sys.version_info <= (2, 5) and hasattr(exc, 'message'): - message = exc.message - elif hasattr(exc, 'reason'): - message = exc.reason - response['message'] = str(message) - - # Fix error message - if response['message'].endswith('timed out'): - response['message'] = 'Connection timed out. The Nomad ' + \ - 'Analytics API Service is currently unavailable.' - - # Return result - return result - def resolve(self, nmd, path='', recursive=True, timeout=10): """Resolve a Nomad URI. @@ -303,7 +289,7 @@ class NomadQuery(object): if not os.path.isdir(base_path): return queries - # Get all stored queries + # Get all stored queries for filename in os.listdir(base_path): path = os.path.join(base_path, filename) if os.path.isfile(path): @@ -322,17 +308,22 @@ class NomadQuery(object): queries.sort(key=lambda x: -x['timestamp']) return queries - def query(self, query, group_by='', context='', timeout=10): + def query(self, query, group_by='', timeout=10, **kwargs): """Query the Nomad Database. Arguments: query {str} -- The query string (see Nomad API reference) Keyword Arguments: - group_by {str} -- Group-by field. (default: {''}) - context {str} -- Query context. Leave empty to use - `single_configuration_calculation` (default: {''}) - timeout {number} -- Timeout of the request in seconds (default: {10}) + group_by {str} -- Group-by field. (default: {''}) + num_results {int} -- Number of calculations to return + (default: {10000}) + num_groups {int} -- Number of distinct calculation groups to return + (default: {10}) + context {str} -- Deprecated: Query context. Leave empty to use + `single_configuration_calculation` (default: {''}) + compat {bool} -- Compatibility mode (default: {True}) + timeout {number} -- Timeout of the request in seconds (default: {10}) Returns: NomadQueryResult -- The Nomad query result @@ -343,17 +334,27 @@ class NomadQuery(object): RuntimeError -- Unknown error. Please inform the Nomad team to solve this problem. """ - # Set default context - if not context: - context = 'single_configuration_calculation' - # Construct URL - url = self.endpoint + ('queryGroup/' if group_by else 'query/') + context + url = self.query_endpoint + ('search_grouped' if group_by else 'search') + params = { + 'source_fields': 'archive_gid', + 'sort_field': 'calculation_gid', + 'num_results': max(min(kwargs.get('num_results', 10000), 10000), 1), + 'format': 'nested' + } + + # Normalize query - compatibility fallback + if kwargs.get('compat', True): + query = self._normalize(query) # Add query - url += '?filter=' + quote(query.strip()) + params['query'] = query.strip() if group_by: - url += quote(' GROUPBY ' + group_by.strip().lower()) + params['group_by'] = group_by.strip().lower() + params['num_groups'] = max(kwargs.get('num_groups', 10), 1) + + # Construct URL + url += '?' + urlencode(params).replace('+', '%20') # Read URL response = self.request(url, timeout=timeout) @@ -362,21 +363,18 @@ class NomadQuery(object): # Check connection timeout response = response['data'] - if 'timed_out' in response['result'] and response['result']['timed_out']: + if response['meta'].get('is_timed_out', False) or \ + response['meta'].get('is_terminated_early', False): response['message'] = 'Connection timed out.' - # Check for additional error messages - if 'message' in response or 'msg' in response: - raise RuntimeError(response.get('message', response['msg'])) - # Construct Nomad Query response query = { - 'context': context, - 'endpoint': self.endpoint, - 'filter': query.strip(), - 'group_by': group_by.strip().lower(), + 'endpoint': self.query_endpoint, + 'query': params.get('query', ''), + 'group_by': params.get('group_by', ''), 'url': url } + return NomadQueryResult(query, response, self.__version__) def fetch(self, name_or_index='', resolve=False, **params): @@ -531,6 +529,97 @@ class NomadQuery(object): data['data'] = self._resolve(data['uri'], **params) return data + @staticmethod + def request(url, timeout=10): + """Request a URL + + Arguments: + url {str} -- The URL of a web address + + Keyword Arguments: + timeout {number} -- Timeout of the request in seconds (default: {10}) + + Returns: + dict -- A dictionary with success status, response data, or + error message + """ + # Default request response + result = { + 'url': url, + 'status': 'error', + 'message': 'Unknown error. Please inform the Nomad team to ' + 'solve this problem.' + } + + try: + # Get URL + response = urlopen(Request(url), timeout=timeout) + + # Check response code + if response.code != 200: + raise RuntimeError(result['message']) + + # Read response + data = json.loads(response.read().decode('utf-8'), 'utf-8') + + # Populate result + result.pop('message') + result.update({ + 'status': 'success', + 'data': data + }) + except Exception as exc: + exc = sys.exc_info()[1] + response = result.copy() + + # Get error message + message = exc + if sys.version_info <= (2, 5) and hasattr(exc, 'message'): + message = exc.message + elif hasattr(exc, 'reason'): + message = exc.reason + response['message'] = str(message) + + # Fix error message + if response['message'].endswith('timed out'): + response['message'] = 'Connection timed out. The Nomad ' + \ + 'Analytics API Service is currently unavailable.' + + # Return result + return result + + def _normalize(self, query): + """[Protected] Normalize query syntax + + Arguments: + query {str} -- The query string (see Nomad API reference) + + Returns: + str -- The normalized query string + """ + # Convert nomad query syntax v1 to v2 + if re.search(r'(?<!\\):', query): + values = re.split('\sand\s', query, 0, re.I) + + # Convert query + regex = re.compile(r'([^:]+):(.+)') + for i in range(len(values)): + match = regex.search(values[i]) + if match: + # Make sure strings are properly escaped + value = map(str.strip, match.group(2).split(',')) + value = ','.join((v if v.isdigit() + else '"' + v.strip('\'" ') + '"') + for v in value) + + # Replace colons with equal symbols + values[i] = match.group(1) + ' = ' + value + + # Rebuild query + query = ' AND '.join(values) + + return query + def _resolve(self, paths, size=None, seed=None, **params): """[Protected] Resolve Nomad URIs.