Commit c8f05dd8 authored by Henning Glawe

synchronize with the version from 'encyclopedia-pre-processing'

3dbeda49095faf438840110e9b6e3fccd06a1aef -- Allowed the usage of schemes other than nmd:// in the JSON files. This allows testing with files generated on a local machine.
f689e878a4775b1c4a06a70e5aabbaf2d42be25c -- return code-specific section (named '^x_.*_section.*') as ArchiveSectionJSON
e24727f9a67462d59c5a580888edbc73ac131aeb -- comment on code-specific x_*_section
be56a3083cf035ea0d7d53946581ee1ac4c9e0b4 -- fix repeating-values issue (when looking up N indices, the first value was returned N times instead of the N actual values)
ae7880e302553768a40fa16cef06b848ca33359d -- moved reading mainfile_uri to archiveiterator
75f571c6c99c5377da46b6f56e276689327dfa6e -- Added pid to mongodb. Fixed tests for db changes. Fixed schema and postprocessor for empty pids and repository download uris.
parent 818da844
......@@ -9,6 +9,12 @@ import h5py
import numpy as np
from abc import ABCMeta, abstractmethod
from io import open
import re
import logging
LOGGER = logging.getLogger(__name__)
class ArchiveSection(object):
......@@ -656,17 +662,16 @@ class ArchiveSectionHDF5(ArchiveSection):
# .format(child_path)
# )
index_rows = index_data[test_index]
# If the value can have multiple shapes, the values are split into
# different tables. For each table there is a local index in the
# second column of the index table that we must use.
data = []
for index_row in index_rows:
for row_i in test_index:
index_row = index_data[row_i]
if index_row.shape != (1,):
data_index = index_row[1]
data_index = test_index[0]
data_index = row_i
# The data name may depend on the shape, and if so, the
# shape is appended to the name as base64 fields
......@@ -691,7 +696,7 @@ class ArchiveSectionHDF5(ArchiveSection):
# If one object returned, remove the outermost list
if len(index_rows) == 1:
if len(test_index) == 1:
if data[0].shape == ():
data = np.array([data[0]])
......@@ -734,12 +739,7 @@ class ArchiveJSON(Archive):
# Get the repository name from mainFileUri
mainfile_uri = json_root["mainFileUri"]
if not mainfile_uri.startswith("nmd://"):
raise ValueError(
"The mainFileUri in the JSON Archive file '{}' is invalid."
repository_name = mainfile_uri[6:]
repository_name = mainfile_uri.split("://", 1)[1]
repository_name = repository_name.split("/", 1)[0]
root_section = {
......@@ -849,6 +849,9 @@ class ArchiveSectionJSON(ArchiveSection):
is_section = False
if path.startswith("section"):
is_section = True
elif re.match(r'^x_\S+_section', path):
# code-specific section
is_section = True
# If no index specified, try to get as concrete value or as a list of
# sections
