From a6456e2df2a9f6a25f7a7cfb248d9be7198956f1 Mon Sep 17 00:00:00 2001 From: Markus Scheidgen <markus.scheidgen@gmail.com> Date: Fri, 19 Jul 2019 11:47:19 +0200 Subject: [PATCH] Improved the auxfile handling. Some fixes to rawfile api. Completes implementation of #171. --- gui/src/components/uploads/Upload.js | 20 ++++++-- nomad/api/raw.py | 14 ++++-- nomad/config.py | 2 +- nomad/datamodel/base.py | 1 - nomad/files.py | 33 ++++++++----- nomad/processing/data.py | 66 +++++++++++++------------ tests/data/proc/examples_large_dir.zip | Bin 0 -> 27623 bytes tests/processing/test_data.py | 8 +++ tests/test_api.py | 1 + 9 files changed, 91 insertions(+), 54 deletions(-) create mode 100644 tests/data/proc/examples_large_dir.zip diff --git a/gui/src/components/uploads/Upload.js b/gui/src/components/uploads/Upload.js index 7f09f82525..e944f0bf4f 100644 --- a/gui/src/components/uploads/Upload.js +++ b/gui/src/components/uploads/Upload.js @@ -354,7 +354,20 @@ class Upload extends React.Component { const renderRow = (calc, index) => { const { mainfile, upload_id, calc_id, parser, tasks, current_task, tasks_status, errors } = calc - const color = tasks_status === 'FAILURE' ? 'error' : 'default' + let tooltip_html = null + let color = tasks_status === 'FAILURE' ? 'error' : 'default' + if (tasks_status === 'FAILURE') { + tooltip_html = `Calculation processing failed with errors: ${errors.join(', ')}` + color = 'error' + } + if (calc.errors.length > 0) { + color = 'error' + tooltip_html = `Calculation processed with errors: ${calc.errors.join(', ')}` + } + if (calc.warnings.length > 0) { + color = 'error' + tooltip_html = `Calculation processed with warnings: ${calc.warnings.join(', ')}` + } const processed = tasks_status === 'FAILURE' || tasks_status === 'SUCCESS' const row = ( <TableRow key={index} hover={processed} @@ -391,10 +404,9 @@ class Upload extends React.Component { </TableRow> ) - if (tasks_status === 'FAILURE') { - const error_html = `Calculation processing failed with errors: ${errors.join(', ')}` + if (tooltip_html) { return ( - <Tooltip key={calc_id} title={error_html}> + <Tooltip key={calc_id} title={tooltip_html}> {row} </Tooltip> ) diff --git a/nomad/api/raw.py b/nomad/api/raw.py index 145170dd6b..d7930b72cc 100644 --- a/nomad/api/raw.py +++ b/nomad/api/raw.py @@ -100,7 +100,8 @@ class FileView: return self.f.read(size) -def get_raw_file_from_upload_path(upload_files, upload_filepath, authorization_predicate): +def get_raw_file_from_upload_path( + upload_files, upload_filepath, authorization_predicate, mainfile: str = None): """ Helper method used by func:`RawFileFromUploadPathResource.get` and func:`RawFileFromCalcPathResource.get`. @@ -148,11 +149,14 @@ def get_raw_file_from_upload_path(upload_files, upload_filepath, authorization_p directory_files = upload_files.raw_file_list(upload_filepath) if len(directory_files) == 0: abort(404, message='There is nothing to be found at %s.' % upload_filepath) + + contents = sorted([dict(name=name, size=size) for name, size in directory_files], key=lambda x: '' if x['name'] == mainfile else x['name']) + # if mainfile is not None: + # contents = [mainfile] + [content for content in contents if content['name'] != mainfile] return { 'upload_id': upload_files.upload_id, 'directory': upload_filepath, - 'contents': [ - dict(name=name, size=size) for name, size in directory_files] + 'contents': contents }, 200 @@ -252,7 +256,9 @@ class RawFileFromCalcPathResource(Resource): abort(404, message='The calc with id %s is not part of the upload with id %s.' % (calc_id, upload_id)) upload_filepath = os.path.join(os.path.dirname(calc.mainfile), calc_filepath) - return get_raw_file_from_upload_path(upload_files, upload_filepath, authorization_predicate) + return get_raw_file_from_upload_path( + upload_files, upload_filepath, authorization_predicate, + mainfile=os.path.basename(calc.mainfile)) @ns.route('/calc/<string:upload_id>/<string:calc_id>/') diff --git a/nomad/config.py b/nomad/config.py index 77df549174..ac13f62664 100644 --- a/nomad/config.py +++ b/nomad/config.py @@ -176,7 +176,7 @@ commit = gitinfo.commit release = 'devel' domain = 'DFT' service = 'unknown nomad service' -auxfile_cutoff = 30 +auxfile_cutoff = 100 console_log_level = logging.WARNING diff --git a/nomad/datamodel/base.py b/nomad/datamodel/base.py index b8f49a7740..deae57fecb 100644 --- a/nomad/datamodel/base.py +++ b/nomad/datamodel/base.py @@ -85,7 +85,6 @@ class CalcWithMetadata(): # basic upload and processing related metadata self.upload_time: datetime.datetime = None self.files: List[str] = None - self.file_sizes: List[int] = None self.uploader: utils.POPO = None self.processed: bool = False self.last_processing: datetime.datetime = None diff --git a/nomad/files.py b/nomad/files.py index 71e1b2e0a7..01e250e23f 100644 --- a/nomad/files.py +++ b/nomad/files.py @@ -468,9 +468,11 @@ class StagingUploadFiles(UploadFiles): if not calc.with_embargo: mainfile = calc.mainfile assert mainfile is not None - for filepath in self.calc_files(mainfile): - if not always_restricted(filepath): - public_files[filepath] = None + # mainfile might already have been added due to being a auxfile to another calc + if mainfile not in public_files: + for filepath in self.calc_files(mainfile, with_cutoff=False): + if not always_restricted(filepath): + public_files[filepath] = None # 1.2 remove the non public mainfiles that have been added as auxfiles of public mainfiles for calc in upload.calcs: if calc.with_embargo: @@ -542,7 +544,7 @@ class StagingUploadFiles(UploadFiles): return results - def calc_files(self, mainfile: str, with_mainfile: bool = True) -> Iterable[str]: + def calc_files(self, mainfile: str, with_mainfile: bool = True, with_cutoff: bool = True) -> Iterable[str]: """ Returns all the auxfiles and mainfile for a given mainfile. This implements nomad's logic about what is part of a calculation and what not. The mainfile @@ -559,16 +561,21 @@ class StagingUploadFiles(UploadFiles): calc_dir = os.path.dirname(mainfile_object.os_path) calc_relative_dir = calc_dir[len(self._raw_dir.os_path) + 1:] - ls = os.listdir(calc_dir) - if len(ls) > config.auxfile_cutoff: - # If there are two many of them, its probably just a directory with lots of - # calculations. In this case it does not make any sense to provide thousands of - # aux files. - return [mainfile] if with_mainfile else [] + file_count = 0 + aux_files: List[str] = [] + for filename in os.listdir(calc_dir): + if filename != mainfile_basename and os.path.isfile(os.path.join(calc_dir, filename)): + aux_files.append(os.path.join(calc_relative_dir, filename)) + file_count += 1 + + if with_cutoff and file_count > config.auxfile_cutoff: + # If there are two many of them, its probably just a directory with lots of + # calculations. In this case it does not make any sense to provide thousands of + # aux files. + break + + aux_files = sorted(aux_files) - aux_files = sorted( - os.path.join(calc_relative_dir, path) for path in ls - if os.path.isfile(os.path.join(calc_dir, path)) and path != mainfile_basename) if with_mainfile: return [mainfile] + aux_files else: diff --git a/nomad/processing/data.py b/nomad/processing/data.py index d54cd60763..53ae8f660e 100644 --- a/nomad/processing/data.py +++ b/nomad/processing/data.py @@ -164,9 +164,13 @@ class Calc(Proc): calc_with_metadata.nomad_commit = config.commit calc_with_metadata.last_processing = datetime.now() calc_with_metadata.files = self.upload_files.calc_files(self.mainfile) - self.preprocess_files(calc_with_metadata.files) self.metadata = calc_with_metadata.to_dict() + if len(calc_with_metadata.files) >= config.auxfile_cutoff: + self.warning( + 'This calc has many aux files in its directory. ' + 'Have you placed many calculations in the same directory?') + self.parsing() self.normalizing() self.archiving() @@ -215,36 +219,6 @@ class Calc(Proc): self.upload.reload() self.upload.check_join() - def preprocess_files(self, filepaths): - for path in filepaths: - if os.path.basename(path) == 'POTCAR': - # create checksum - hash = hashlib.sha224() - with open(self.upload_files.raw_file_object(path).os_path, 'rb') as f: - for line in f.readlines(): - hash.update(line) - - checksum = hash.hexdigest() - - # created stripped POTCAR - stripped_path = path + '.stripped' - with open(self.upload_files.raw_file_object(stripped_path).os_path, 'wt') as f: - f.write('Stripped POTCAR file. Checksum of original file (sha224): %s\n' % checksum) - os.system( - ''' - awk < %s >> %s ' - BEGIN { dump=1 } - /End of Dataset/ { dump=1 } - dump==1 { print } - /END of PSCTR/ { dump=0 }' - ''' % ( - self.upload_files.raw_file_object(path).os_path, - self.upload_files.raw_file_object(stripped_path).os_path)) - - filepaths.append(stripped_path) - - return filepaths - @task def parsing(self): context = dict(parser=self.parser, step=self.parser) @@ -652,6 +626,35 @@ class Upload(Proc): self.fail('bad .zip/.tar file', log_level=logging.INFO) return + def _preprocess_files(self, path): + """ + Some files need preprocessing. Currently we need to add a stripped POTCAR version + and always restrict/embargo the original. + """ + if os.path.basename(path) == 'POTCAR': + # create checksum + hash = hashlib.sha224() + with open(self.staging_upload_files.raw_file_object(path).os_path, 'rb') as orig_f: + for line in orig_f.readlines(): + hash.update(line) + + checksum = hash.hexdigest() + + # created stripped POTCAR + stripped_path = path + '.stripped' + with open(self.staging_upload_files.raw_file_object(stripped_path).os_path, 'wt') as stripped_f: + stripped_f.write('Stripped POTCAR file. Checksum of original file (sha224): %s\n' % checksum) + os.system( + ''' + awk < %s >> %s ' + BEGIN { dump=1 } + /End of Dataset/ { dump=1 } + dump==1 { print } + /END of PSCTR/ { dump=0 }' + ''' % ( + self.staging_upload_files.raw_file_object(path).os_path, + self.staging_upload_files.raw_file_object(stripped_path).os_path)) + def match_mainfiles(self) -> Generator[Tuple[str, object], None, None]: """ Generator function that matches all files in the upload to all parsers to @@ -663,6 +666,7 @@ class Upload(Proc): directories_with_match: Dict[str, str] = dict() upload_files = self.staging_upload_files for filename in upload_files.raw_file_manifest(): + self._preprocess_files(filename) try: parser = match_parser(filename, upload_files) if parser is not None: diff --git a/tests/data/proc/examples_large_dir.zip b/tests/data/proc/examples_large_dir.zip new file mode 100644 index 0000000000000000000000000000000000000000..dbf3663359d8e8ec414bcf434d776ae5d8b43fd2 GIT binary patch literal 27623 zcmWIWW@h1H00EZZ&wd~ph9wwe7*Z<|a|?1(i{o<=i_%l$Q!<P64fPUBD?&p!8JN}U z3}ZpKw1S&~k>x8R0|Sc)0|NtY6D4t&Xh6P^Qn-vnn5j``7z@HgI}@jw2(N?817XU| zG=lpbWF81pZl*B}%rv2anWi)_(~JgYn$y5c3mTYdNxqr5vL+(QgK|0uQ=B{vX^=Gy z5$*(KArPkAoixarhBU~UhBU~UhBU~UhBU~UhBU~UhBU~UhBU~Uh~x>%mLN=Vk~gA3 z)-<9))-*!+9hB8Um~y|<AZr@YAZr@YAZr@YAZr@YAZr@YAZr@YAZsF$5-3}OFvZE! zm<CzXm<CzXm<CzX7!jVJtPaAIhbIlPrZEk&rZEk&rZEk&rZEk&rZEk&CL-EF*%E{) zPD&;;$eJcJ$eJcJ$eJcJ$eJdII09vL5T-niXpl8cXpl8cXpl8cXpl8cXpl7#Q3T4C zAWU(zn^KT9adn9iu?}(~2ovp0Tt*@y9b_B`Q*0z6&Oye3FvUh9q8wx#2or52j<G~U zfdn!Qgo!p0M^Ga5U_i!!FvUh9ieiv)AWX55H1Ig0AO^V*gei6<q8tVp2f`E^NkJaR z730*;k4Rk?kf%YIVvkclKO%KpKrRGfqFsq2KT<ypA$40o4g_JM9f>2tk$Np4<3N~V zBdMPsk@_qk7lJUwuB3i`MC!4CTnNGxyOR3(5vjidav=y)>`LnAN2J~g$b}$Gv@3Ds zN9sp7QeOq+KoBO{kvQ@rQdb3J90*fvB=z$nQbz^kLJ+3dl{APR>er7*y%dnAL6~BX zQ@_26)JFli5QHgqCH32@)Q=sc4hqQ6AWW&hO%d%?P}doRDK-+3A3?@}FvUh9+8Q9^ zK$v195&02h90*fvBqGK^#(^-!Mk4A*kZ~YPv5|=U2r>?YDK-*OKZ1+{VTz4J<VTQk zAWXE8IO<14P=YcJ2or50j>^o8`U3`L)X$G*)X$G*)X$G*)X$G*)X$G*h(rWRk|0cR zBBFkNG^2igG^2igMEDz&7eScPpfsm`el(|kel(|kel(|kel(|kel(|kel(|kel$m< zOHlR(VT#iw_4A`S_46abk)XT?!j$^kg8KQ<g8KQ<g8KQ<g8KQ<g8KQ<g8KQ<g8KQ< zg8KQ<0+G}~*&Bo@PU_Upj|dY%c@cyub)+Tr^P?s8^P?s8^P?s8^P?s8^P?s8^P?s8 z^P?s8^P?s8^P?pq_kglD2xG}TpdAPt48p;m{RAD?d?{pNV36fzU{JumO`s$-w;(66 zBvmh~I6n`(r@^((FcyT-Ox_w6?SI=qr0#imL}i;s;jG*D8hBf_v8!BYF$taK>2K}b zH}@vrV&^~Kcjmqjx_&`q<Gw9+$M63<x3|v9>6>uI3ZBVLE5cajU+CW|uyFai+i?m_ z*AhaTXCJry`^f9egNye%sxG;mY+dwK?9bmX&#XSYkax(;E-6hFN=uxiG-ZyNmR5l3 z>$bznPQ0O0iXXg?7JmEq>d~F&&)Ht}5$0{1k-9l8$3nEB?lSv@+5_GqrZeUjUE{8l z*k=EQujEb4>-2;bOnDQ%xWWW;L!xJf9khS-;PcV7*>753y_<6;dgDLeon?RSe7*X9 zee#nV%pXJZSIa)weobHB|D^0ryWc_<9nTL|Rz11wW4rZ&ckTka4Pvq{)<*G|%(|+( zJUfGRPPNgid4gVtZM=)-B=V?lRCf$~A#Sy4Wp(ECli8Upf7uNyTPGWwx@@A|z35KO zr>W9<k2gH?yQHo8Ea&1X-+R&lKIh)(oQdE~GX1{q*tTM(b;j#Y-cwobc5TVTSu?|D zlud3vyr`pFr}Hs0SKncclb@4<{M2F>)l^ocr+%ya5`WTu-Tv?G8_hqmo8H^?ZO*;s z@^ZHHtaM4X_;OeCGuqYfXC)^cY~)E_v}n((Z<iKw7tG(u9QOG3w2p~)cd2qcOXiIT zxPR<f{CiPm1qa<}DeDqMA9>0wOP%yIQf~UkB}XKdZ`0<oKjz{5Q!M3rXxnKW?b9ay z-x^NjZaJi?*jG4BR7`sM#JhzO{rZ|)`$eSx{GKPQS5<ScETfs#qdLG_N&0ZsxAe<Q zi<xKMb{Dzv{qlM-7gqLT4;$Ru4Ueqd{YKTGezpaJUT$;ljnei4bz3Rp)$X+?Uw!!_ z9^lQ$B*%<vV~zxDj}F6<Mi3MCHXT;THXXF&jL&pQ24vHN7#Of_0)kdMq#G}VV*FBE z#v{zfn$K|u4yyS-ahVUT<5(G(DKg)e5l8SqD?CcgUx>?mXzfR-`Jhc(*b<Kcv?`?3 zd`~7E;RCG~DK&pCF7u(4B&Ftm#AQCT#-!AIS7sdH1Fb$OHGd{9^PzPq>E@#r4{vdp zkH}tF8@a@0Uk4WK!2{buM7r})i-#$=%tyGNqT=BNF7v5WJlL?}2p=jH4-;^iPo?7F z2`=-gR6JO);Rqip6%ReQ%%@WE@Bo+jR4N`!*l~mpm5PTB9OfgEKSjmEZCu7vsr8`G zfjxL&Tbd{>ewuNaj|d-%il6Ja%%@WEpv{RRe5h1B)Z;RrO2xxvT;@}$cu?oU5k6EZ z9;$JfPo?7FJTCL8R6HniV-Fuh@}#JED8pqum5PT`xXh<g>p_kONAOUocqqbUJ|g@n zDt?aPGM`GtgA^~0@S#%ikcZ2BDisfhaG6h~;z5iLNBB^wc*w$KK9!1xeK^cVBp-^3 z2SI-9!2??&Nonn#ipzW|6%RXcnNOwSfmZ-W_)w{MNW^76B7P|<ezxK=pGw68ry!2- zp;GY>i_3f}6%QM6nNOwSfmH}c_)w{Mh{Rz&B6=w*9@gM8p6u2G>d*;;F!ta<q#hET zk7|4{4&xDtha%&b;V>SNb|^CbCl2EgNryz^B~XT)eMGQ^J)+<w(R9?HpNGSAM8cuS z_>VY@M^q~m8Sf&BJ#45HIMZ<$kEm2Aa{VhD#v|$!ij21v!yY(fmpP~@X95o6sZ=IB z#$h~_$^<iU?14k2GNBWP@l+}k?qD~bO6gKh0&Cd7w#Sh^kA|9X8gUp;r840P4&$j* zCa6nd4;(6meH9MlsZ=JM#bG>^$^-=|?14k2GNBlU@l+}kj^Qw#N@aqiH1@!uQkjs0 z-FPY`oP9V<r&2pXPzGz*z_#mATI8hQFrG?f!gd_SQ>jeglEof4REpYI9L7_rOxS?K zcq)|%%yQTRhe~Ba7!Kp9R3@y%VLX+}gg@Agr&7%N$zu(A*fv2*Gwnhg#v|GZBo4Wt zHbcJPFdk7RP-MKj0`|Z`lnE3WKMRNPh%$j9<KN;i9+7e=GTuQEd*C3-1d5EGjKg?D znLv^8&u|!zC=)0$-ckvB;2_Eb5{*Y4Pw2sJJR;~R>U7@2VLGCnK%(POGp(UA*06zX zTO@rL9M$+19L7_rOt_B2cq)|%S}NEBhe~BaEe_+UR3==&VLT#XQj~I(Rj~&SmCA%N z9L7_rOgM?dcq)|%GHO@@2N5<Dl?iz`Os7(ra1e*_RB9)PsACTsDwPT8IE<%KnXn6o z@l+}kcr~yG4wcG;1RTawsZ7|6!+1oxr>Mwb)5IP)R4Nl9aTrgfGGPsN;}MRhs7&~W z!*nW@34vNzgWiHlWx^61##5<G_>RMPDwPRd+Smh!N@c=a9L7_rO!$Dqcq)|%&N|ow zhe~C_G#thwvK2)|&PyD|Q>jd_(Zw1#2-7Jl6Z&wNPNg#8Ar9lIR3@0{VGkQBl?m-Q zjHgnWa0`d=R4Nm6^|1#ImCA$$9L7_rOt_50cq)|%Y6jQ?he~BaB@W{exfW-UgKL8- tWY--d0|Ud7#xvNBN87H-%D{|et7?EZD;sD};B<!R3|1Bl4AYH4JOGIi<a7W4 literal 0 HcmV?d00001 diff --git a/tests/processing/test_data.py b/tests/processing/test_data.py index e56455c6d1..eb2e415027 100644 --- a/tests/processing/test_data.py +++ b/tests/processing/test_data.py @@ -109,6 +109,14 @@ def test_processing(processed, no_warn, mails, monkeypatch): assert re.search(r'Processing completed', mails.messages[0].data.decode('utf-8')) is not None +def test_processing_with_large_dir(test_user, proc_infra): + upload_path = 'tests/data/proc/examples_large_dir.zip' + upload_id = upload_path[:-4] + upload = run_processing((upload_id, upload_path), test_user) + for calc in upload.calcs: + assert len(calc.warnings) == 1 + + def test_publish(non_empty_processed: Upload, no_warn, example_user_metadata, with_publish_to_coe_repo, monkeypatch): processed = non_empty_processed processed.compress_and_set_metadata(example_user_metadata) diff --git a/tests/test_api.py b/tests/test_api.py index acc8a947ac..67ba51f68c 100644 --- a/tests/test_api.py +++ b/tests/test_api.py @@ -460,6 +460,7 @@ class TestUploads: # assert rv.status_code == 400 def test_potcar(self, client, proc_infra, test_user_auth): + # only the owner, shared with people are supposed to download the original potcar file example_file = 'tests/data/proc/examples_potcar.zip' rv = client.put('/uploads/?local_path=%s' % example_file, headers=test_user_auth) -- GitLab