From a6456e2df2a9f6a25f7a7cfb248d9be7198956f1 Mon Sep 17 00:00:00 2001
From: Markus Scheidgen <markus.scheidgen@gmail.com>
Date: Fri, 19 Jul 2019 11:47:19 +0200
Subject: [PATCH] Improved the auxfile handling. Some fixes to rawfile api.
 Completes implementation of #171.

---
 gui/src/components/uploads/Upload.js   |  20 ++++++--
 nomad/api/raw.py                       |  14 ++++--
 nomad/config.py                        |   2 +-
 nomad/datamodel/base.py                |   1 -
 nomad/files.py                         |  33 ++++++++-----
 nomad/processing/data.py               |  66 +++++++++++++------------
 tests/data/proc/examples_large_dir.zip | Bin 0 -> 27623 bytes
 tests/processing/test_data.py          |   8 +++
 tests/test_api.py                      |   1 +
 9 files changed, 91 insertions(+), 54 deletions(-)
 create mode 100644 tests/data/proc/examples_large_dir.zip

diff --git a/gui/src/components/uploads/Upload.js b/gui/src/components/uploads/Upload.js
index 7f09f82525..e944f0bf4f 100644
--- a/gui/src/components/uploads/Upload.js
+++ b/gui/src/components/uploads/Upload.js
@@ -354,7 +354,20 @@ class Upload extends React.Component {
 
     const renderRow = (calc, index) => {
       const { mainfile, upload_id, calc_id, parser, tasks, current_task, tasks_status, errors } = calc
-      const color = tasks_status === 'FAILURE' ? 'error' : 'default'
+      let tooltip_html = null
+      let color = tasks_status === 'FAILURE' ? 'error' : 'default'
+      if (tasks_status === 'FAILURE') {
+        tooltip_html = `Calculation processing failed with errors: ${errors.join(', ')}`
+        color = 'error'
+      }
+      if (calc.errors.length > 0) {
+        color = 'error'
+        tooltip_html = `Calculation processed with errors: ${calc.errors.join(', ')}`
+      }
+      if (calc.warnings.length > 0) {
+        color = 'error'
+        tooltip_html = `Calculation processed with warnings: ${calc.warnings.join(', ')}`
+      }
       const processed = tasks_status === 'FAILURE' || tasks_status === 'SUCCESS'
       const row = (
         <TableRow key={index} hover={processed}
@@ -391,10 +404,9 @@ class Upload extends React.Component {
         </TableRow>
       )
 
-      if (tasks_status === 'FAILURE') {
-        const error_html = `Calculation processing failed with errors: ${errors.join(', ')}`
+      if (tooltip_html) {
         return (
-          <Tooltip key={calc_id} title={error_html}>
+          <Tooltip key={calc_id} title={tooltip_html}>
             {row}
           </Tooltip>
         )
diff --git a/nomad/api/raw.py b/nomad/api/raw.py
index 145170dd6b..d7930b72cc 100644
--- a/nomad/api/raw.py
+++ b/nomad/api/raw.py
@@ -100,7 +100,8 @@ class FileView:
         return self.f.read(size)
 
 
-def get_raw_file_from_upload_path(upload_files, upload_filepath, authorization_predicate):
+def get_raw_file_from_upload_path(
+        upload_files, upload_filepath, authorization_predicate, mainfile: str = None):
     """
     Helper method used by func:`RawFileFromUploadPathResource.get` and
     func:`RawFileFromCalcPathResource.get`.
@@ -148,11 +149,14 @@ def get_raw_file_from_upload_path(upload_files, upload_filepath, authorization_p
         directory_files = upload_files.raw_file_list(upload_filepath)
         if len(directory_files) == 0:
             abort(404, message='There is nothing to be found at %s.' % upload_filepath)
+
+        contents = sorted([dict(name=name, size=size) for name, size in directory_files], key=lambda x: '' if x['name'] == mainfile else x['name'])
+        # if mainfile is not None:
+        #     contents = [mainfile] + [content for content in contents if content['name'] != mainfile]
         return {
             'upload_id': upload_files.upload_id,
             'directory': upload_filepath,
-            'contents': [
-                dict(name=name, size=size) for name, size in directory_files]
+            'contents': contents
         }, 200
 
 
@@ -252,7 +256,9 @@ class RawFileFromCalcPathResource(Resource):
             abort(404, message='The calc with id %s is not part of the upload with id %s.' % (calc_id, upload_id))
 
         upload_filepath = os.path.join(os.path.dirname(calc.mainfile), calc_filepath)
-        return get_raw_file_from_upload_path(upload_files, upload_filepath, authorization_predicate)
+        return get_raw_file_from_upload_path(
+            upload_files, upload_filepath, authorization_predicate,
+            mainfile=os.path.basename(calc.mainfile))
 
 
 @ns.route('/calc/<string:upload_id>/<string:calc_id>/')
diff --git a/nomad/config.py b/nomad/config.py
index 77df549174..ac13f62664 100644
--- a/nomad/config.py
+++ b/nomad/config.py
@@ -176,7 +176,7 @@ commit = gitinfo.commit
 release = 'devel'
 domain = 'DFT'
 service = 'unknown nomad service'
-auxfile_cutoff = 30
+auxfile_cutoff = 100
 console_log_level = logging.WARNING
 
 
diff --git a/nomad/datamodel/base.py b/nomad/datamodel/base.py
index b8f49a7740..deae57fecb 100644
--- a/nomad/datamodel/base.py
+++ b/nomad/datamodel/base.py
@@ -85,7 +85,6 @@ class CalcWithMetadata():
         # basic upload and processing related metadata
         self.upload_time: datetime.datetime = None
         self.files: List[str] = None
-        self.file_sizes: List[int] = None
         self.uploader: utils.POPO = None
         self.processed: bool = False
         self.last_processing: datetime.datetime = None
diff --git a/nomad/files.py b/nomad/files.py
index 71e1b2e0a7..01e250e23f 100644
--- a/nomad/files.py
+++ b/nomad/files.py
@@ -468,9 +468,11 @@ class StagingUploadFiles(UploadFiles):
             if not calc.with_embargo:
                 mainfile = calc.mainfile
                 assert mainfile is not None
-                for filepath in self.calc_files(mainfile):
-                    if not always_restricted(filepath):
-                        public_files[filepath] = None
+                # mainfile might already have been added due to being an auxfile of another calc
+                if mainfile not in public_files:
+                    for filepath in self.calc_files(mainfile, with_cutoff=False):
+                        if not always_restricted(filepath):
+                            public_files[filepath] = None
         # 1.2 remove the non public mainfiles that have been added as auxfiles of public mainfiles
         for calc in upload.calcs:
             if calc.with_embargo:
@@ -542,7 +544,7 @@ class StagingUploadFiles(UploadFiles):
 
         return results
 
-    def calc_files(self, mainfile: str, with_mainfile: bool = True) -> Iterable[str]:
+    def calc_files(self, mainfile: str, with_mainfile: bool = True, with_cutoff: bool = True) -> Iterable[str]:
         """
         Returns all the auxfiles and mainfile for a given mainfile. This implements
         nomad's logic about what is part of a calculation and what not. The mainfile
@@ -559,16 +561,21 @@ class StagingUploadFiles(UploadFiles):
         calc_dir = os.path.dirname(mainfile_object.os_path)
         calc_relative_dir = calc_dir[len(self._raw_dir.os_path) + 1:]
 
-        ls = os.listdir(calc_dir)
-        if len(ls) > config.auxfile_cutoff:
-            # If there are two many of them, its probably just a directory with lots of
-            # calculations. In this case it does not make any sense to provide thousands of
-            # aux files.
-            return [mainfile] if with_mainfile else []
+        file_count = 0
+        aux_files: List[str] = []
+        for filename in os.listdir(calc_dir):
+            if filename != mainfile_basename and os.path.isfile(os.path.join(calc_dir, filename)):
+                aux_files.append(os.path.join(calc_relative_dir, filename))
+                file_count += 1
+
+            if with_cutoff and file_count > config.auxfile_cutoff:
+                # If there are too many of them, it's probably just a directory with lots of
+                # calculations. In this case it does not make any sense to provide thousands of
+                # aux files.
+                break
+
+        aux_files = sorted(aux_files)
 
-        aux_files = sorted(
-            os.path.join(calc_relative_dir, path) for path in ls
-            if os.path.isfile(os.path.join(calc_dir, path)) and path != mainfile_basename)
         if with_mainfile:
             return [mainfile] + aux_files
         else:
diff --git a/nomad/processing/data.py b/nomad/processing/data.py
index d54cd60763..53ae8f660e 100644
--- a/nomad/processing/data.py
+++ b/nomad/processing/data.py
@@ -164,9 +164,13 @@ class Calc(Proc):
             calc_with_metadata.nomad_commit = config.commit
             calc_with_metadata.last_processing = datetime.now()
             calc_with_metadata.files = self.upload_files.calc_files(self.mainfile)
-            self.preprocess_files(calc_with_metadata.files)
             self.metadata = calc_with_metadata.to_dict()
 
+            if len(calc_with_metadata.files) >= config.auxfile_cutoff:
+                self.warning(
+                    'This calc has many aux files in its directory. '
+                    'Have you placed many calculations in the same directory?')
+
             self.parsing()
             self.normalizing()
             self.archiving()
@@ -215,36 +219,6 @@ class Calc(Proc):
             self.upload.reload()
             self.upload.check_join()
 
-    def preprocess_files(self, filepaths):
-        for path in filepaths:
-            if os.path.basename(path) == 'POTCAR':
-                # create checksum
-                hash = hashlib.sha224()
-                with open(self.upload_files.raw_file_object(path).os_path, 'rb') as f:
-                    for line in f.readlines():
-                        hash.update(line)
-
-                checksum = hash.hexdigest()
-
-                # created stripped POTCAR
-                stripped_path = path + '.stripped'
-                with open(self.upload_files.raw_file_object(stripped_path).os_path, 'wt') as f:
-                    f.write('Stripped POTCAR file. Checksum of original file (sha224): %s\n' % checksum)
-                os.system(
-                    '''
-                        awk < %s >> %s '
-                        BEGIN { dump=1 }
-                        /End of Dataset/ { dump=1 }
-                        dump==1 { print }
-                        /END of PSCTR/ { dump=0 }'
-                    ''' % (
-                        self.upload_files.raw_file_object(path).os_path,
-                        self.upload_files.raw_file_object(stripped_path).os_path))
-
-                filepaths.append(stripped_path)
-
-        return filepaths
-
     @task
     def parsing(self):
         context = dict(parser=self.parser, step=self.parser)
@@ -652,6 +626,35 @@ class Upload(Proc):
             self.fail('bad .zip/.tar file', log_level=logging.INFO)
             return
 
+    def _preprocess_files(self, path):
+        """
+        Some files need preprocessing. Currently we need to add a stripped POTCAR version
+        and always restrict/embargo the original.
+        """
+        if os.path.basename(path) == 'POTCAR':
+            # create checksum
+            hash = hashlib.sha224()
+            with open(self.staging_upload_files.raw_file_object(path).os_path, 'rb') as orig_f:
+                for line in orig_f.readlines():
+                    hash.update(line)
+
+            checksum = hash.hexdigest()
+
+            # create stripped POTCAR
+            stripped_path = path + '.stripped'
+            with open(self.staging_upload_files.raw_file_object(stripped_path).os_path, 'wt') as stripped_f:
+                stripped_f.write('Stripped POTCAR file. Checksum of original file (sha224): %s\n' % checksum)
+            os.system(
+                '''
+                    awk < %s >> %s '
+                    BEGIN { dump=1 }
+                    /End of Dataset/ { dump=1 }
+                    dump==1 { print }
+                    /END of PSCTR/ { dump=0 }'
+                ''' % (
+                    self.staging_upload_files.raw_file_object(path).os_path,
+                    self.staging_upload_files.raw_file_object(stripped_path).os_path))
+
     def match_mainfiles(self) -> Generator[Tuple[str, object], None, None]:
         """
         Generator function that matches all files in the upload to all parsers to
@@ -663,6 +666,7 @@ class Upload(Proc):
         directories_with_match: Dict[str, str] = dict()
         upload_files = self.staging_upload_files
         for filename in upload_files.raw_file_manifest():
+            self._preprocess_files(filename)
             try:
                 parser = match_parser(filename, upload_files)
                 if parser is not None:
diff --git a/tests/data/proc/examples_large_dir.zip b/tests/data/proc/examples_large_dir.zip
new file mode 100644
index 0000000000000000000000000000000000000000..dbf3663359d8e8ec414bcf434d776ae5d8b43fd2
GIT binary patch
literal 27623
zcmWIWW@h1H00EZZ&wd~ph9wwe7*Z<|a|?1(i{o<=i_%l$Q!<P64fPUBD?&p!8JN}U
z3}ZpKw1S&~k>x8R0|Sc)0|NtY6D4t&Xh6P^Qn-vnn5j``7z@HgI}@jw2(N?817XU|
zG=lpbWF81pZl*B}%rv2anWi)_(~JgYn$y5c3mTYdNxqr5vL+(QgK|0uQ=B{vX^=Gy
z5$*(KArPkAoixarhBU~UhBU~UhBU~UhBU~UhBU~UhBU~UhBU~Uh~x>%mLN=Vk~gA3
z)-<9))-*!+9hB8Um~y|<AZr@YAZr@YAZr@YAZr@YAZr@YAZr@YAZsF$5-3}OFvZE!
zm<CzXm<CzXm<CzX7!jVJtPaAIhbIlPrZEk&rZEk&rZEk&rZEk&rZEk&CL-EF*%E{)
zPD&;;$eJcJ$eJcJ$eJcJ$eJdII09vL5T-niXpl8cXpl8cXpl8cXpl8cXpl7#Q3T4C
zAWU(zn^KT9adn9iu?}(~2ovp0Tt*@y9b_B`Q*0z6&Oye3FvUh9q8wx#2or52j<G~U
zfdn!Qgo!p0M^Ga5U_i!!FvUh9ieiv)AWX55H1Ig0AO^V*gei6<q8tVp2f`E^NkJaR
z730*;k4Rk?kf%YIVvkclKO%KpKrRGfqFsq2KT<ypA$40o4g_JM9f>2tk$Np4<3N~V
zBdMPsk@_qk7lJUwuB3i`MC!4CTnNGxyOR3(5vjidav=y)>`LnAN2J~g$b}$Gv@3Ds
zN9sp7QeOq+KoBO{kvQ@rQdb3J90*fvB=z$nQbz^kLJ+3dl{APR>er7*y%dnAL6~BX
zQ@_26)JFli5QHgqCH32@)Q=sc4hqQ6AWW&hO%d%?P}doRDK-+3A3?@}FvUh9+8Q9^
zK$v195&02h90*fvBqGK^#(^-!Mk4A*kZ~YPv5|=U2r>?YDK-*OKZ1+{VTz4J<VTQk
zAWXE8IO<14P=YcJ2or50j>^o8`U3`L)X$G*)X$G*)X$G*)X$G*)X$G*h(rWRk|0cR
zBBFkNG^2igG^2igMEDz&7eScPpfsm`el(|kel(|kel(|kel(|kel(|kel(|kel$m<
zOHlR(VT#iw_4A`S_46abk)XT?!j$^kg8KQ<g8KQ<g8KQ<g8KQ<g8KQ<g8KQ<g8KQ<
zg8KQ<0+G}~*&Bo@PU_Upj|dY%c@cyub)+Tr^P?s8^P?s8^P?s8^P?s8^P?s8^P?s8
z^P?s8^P?s8^P?pq_kglD2xG}TpdAPt48p;m{RAD?d?{pNV36fzU{JumO`s$-w;(66
zBvmh~I6n`(r@^((FcyT-Ox_w6?SI=qr0#imL}i;s;jG*D8hBf_v8!BYF$taK>2K}b
zH}@vrV&^~Kcjmqjx_&`q<Gw9+$M63<x3|v9>6>uI3ZBVLE5cajU+CW|uyFai+i?m_
z*AhaTXCJry`^f9egNye%sxG;mY+dwK?9bmX&#XSYkax(;E-6hFN=uxiG-ZyNmR5l3
z>$bznPQ0O0iXXg?7JmEq>d~F&&)Ht}5$0{1k-9l8$3nEB?lSv@+5_GqrZeUjUE{8l
z*k=EQujEb4>-2;bOnDQ%xWWW;L!xJf9khS-;PcV7*>753y_<6;dgDLeon?RSe7*X9
zee#nV%pXJZSIa)weobHB|D^0ryWc_<9nTL|Rz11wW4rZ&ckTka4Pvq{)<*G|%(|+(
zJUfGRPPNgid4gVtZM=)-B=V?lRCf$~A#Sy4Wp(ECli8Upf7uNyTPGWwx@@A|z35KO
zr>W9<k2gH?yQHo8Ea&1X-+R&lKIh)(oQdE~GX1{q*tTM(b;j#Y-cwobc5TVTSu?|D
zlud3vyr`pFr}Hs0SKncclb@4<{M2F>)l^ocr+%ya5`WTu-Tv?G8_hqmo8H^?ZO*;s
z@^ZHHtaM4X_;OeCGuqYfXC)^cY~)E_v}n((Z<iKw7tG(u9QOG3w2p~)cd2qcOXiIT
zxPR<f{CiPm1qa<}DeDqMA9>0wOP%yIQf~UkB}XKdZ`0<oKjz{5Q!M3rXxnKW?b9ay
z-x^NjZaJi?*jG4BR7`sM#JhzO{rZ|)`$eSx{GKPQS5<ScETfs#qdLG_N&0ZsxAe<Q
zi<xKMb{Dzv{qlM-7gqLT4;$Ru4Ueqd{YKTGezpaJUT$;ljnei4bz3Rp)$X+?Uw!!_
z9^lQ$B*%<vV~zxDj}F6<Mi3MCHXT;THXXF&jL&pQ24vHN7#Of_0)kdMq#G}VV*FBE
z#v{zfn$K|u4yyS-ahVUT<5(G(DKg)e5l8SqD?CcgUx>?mXzfR-`Jhc(*b<Kcv?`?3
zd`~7E;RCG~DK&pCF7u(4B&Ftm#AQCT#-!AIS7sdH1Fb$OHGd{9^PzPq>E@#r4{vdp
zkH}tF8@a@0Uk4WK!2{buM7r})i-#$=%tyGNqT=BNF7v5WJlL?}2p=jH4-;^iPo?7F
z2`=-gR6JO);Rqip6%ReQ%%@WE@Bo+jR4N`!*l~mpm5PTB9OfgEKSjmEZCu7vsr8`G
zfjxL&Tbd{>ewuNaj|d-%il6Ja%%@WEpv{RRe5h1B)Z;RrO2xxvT;@}$cu?oU5k6EZ
z9;$JfPo?7FJTCL8R6HniV-Fuh@}#JED8pqum5PT`xXh<g>p_kONAOUocqqbUJ|g@n
zDt?aPGM`GtgA^~0@S#%ikcZ2BDisfhaG6h~;z5iLNBB^wc*w$KK9!1xeK^cVBp-^3
z2SI-9!2??&Nonn#ipzW|6%RXcnNOwSfmZ-W_)w{MNW^76B7P|<ezxK=pGw68ry!2-
zp;GY>i_3f}6%QM6nNOwSfmH}c_)w{Mh{Rz&B6=w*9@gM8p6u2G>d*;;F!ta<q#hET
zk7|4{4&xDtha%&b;V>SNb|^CbCl2EgNryz^B~XT)eMGQ^J)+<w(R9?HpNGSAM8cuS
z_>VY@M^q~m8Sf&BJ#45HIMZ<$kEm2Aa{VhD#v|$!ij21v!yY(fmpP~@X95o6sZ=IB
z#$h~_$^<iU?14k2GNBWP@l+}k?qD~bO6gKh0&Cd7w#Sh^kA|9X8gUp;r840P4&$j*
zCa6nd4;(6meH9MlsZ=JM#bG>^$^-=|?14k2GNBlU@l+}kj^Qw#N@aqiH1@!uQkjs0
z-FPY`oP9V<r&2pXPzGz*z_#mATI8hQFrG?f!gd_SQ>jeglEof4REpYI9L7_rOxS?K
zcq)|%%yQTRhe~Ba7!Kp9R3@y%VLX+}gg@Agr&7%N$zu(A*fv2*Gwnhg#v|GZBo4Wt
zHbcJPFdk7RP-MKj0`|Z`lnE3WKMRNPh%$j9<KN;i9+7e=GTuQEd*C3-1d5EGjKg?D
znLv^8&u|!zC=)0$-ckvB;2_Eb5{*Y4Pw2sJJR;~R>U7@2VLGCnK%(POGp(UA*06zX
zTO@rL9M$+19L7_rOt_B2cq)|%S}NEBhe~BaEe_+UR3==&VLT#XQj~I(Rj~&SmCA%N
z9L7_rOgM?dcq)|%GHO@@2N5<Dl?iz`Os7(ra1e*_RB9)PsACTsDwPT8IE<%KnXn6o
z@l+}kcr~yG4wcG;1RTawsZ7|6!+1oxr>Mwb)5IP)R4Nl9aTrgfGGPsN;}MRhs7&~W
z!*nW@34vNzgWiHlWx^61##5<G_>RMPDwPRd+Smh!N@c=a9L7_rO!$Dqcq)|%&N|ow
zhe~C_G#thwvK2)|&PyD|Q>jd_(Zw1#2-7Jl6Z&wNPNg#8Ar9lIR3@0{VGkQBl?m-Q
zjHgnWa0`d=R4Nm6^|1#ImCA$$9L7_rOt_50cq)|%Y6jQ?he~BaB@W{exfW-UgKL8-
tWY--d0|Ud7#xvNBN87H-%D{|et7?EZD;sD};B<!R3|1Bl4AYH4JOGIi<a7W4

literal 0
HcmV?d00001

diff --git a/tests/processing/test_data.py b/tests/processing/test_data.py
index e56455c6d1..eb2e415027 100644
--- a/tests/processing/test_data.py
+++ b/tests/processing/test_data.py
@@ -109,6 +109,14 @@ def test_processing(processed, no_warn, mails, monkeypatch):
     assert re.search(r'Processing completed', mails.messages[0].data.decode('utf-8')) is not None
 
 
+def test_processing_with_large_dir(test_user, proc_infra):
+    upload_path = 'tests/data/proc/examples_large_dir.zip'
+    upload_id = upload_path[:-4]
+    upload = run_processing((upload_id, upload_path), test_user)
+    for calc in upload.calcs:
+        assert len(calc.warnings) == 1
+
+
 def test_publish(non_empty_processed: Upload, no_warn, example_user_metadata, with_publish_to_coe_repo, monkeypatch):
     processed = non_empty_processed
     processed.compress_and_set_metadata(example_user_metadata)
diff --git a/tests/test_api.py b/tests/test_api.py
index acc8a947ac..67ba51f68c 100644
--- a/tests/test_api.py
+++ b/tests/test_api.py
@@ -460,6 +460,7 @@ class TestUploads:
     #     assert rv.status_code == 400
 
     def test_potcar(self, client, proc_infra, test_user_auth):
+        # only the owner and users the upload is shared with are supposed to download the original POTCAR file
         example_file = 'tests/data/proc/examples_potcar.zip'
         rv = client.put('/uploads/?local_path=%s' % example_file, headers=test_user_auth)
 
-- 
GitLab