diff --git a/core/src/main/scala/eu/nomad_lab/ParquetSchemaGenerator.scala b/core/src/main/scala/eu/nomad_lab/ParquetSchemaGenerator.scala index 8339388f70c869511ad8e7f83c6038ddd409e275..ced0a48a23d5e32fe3b71a80a89a57c531c319cd 100644 --- a/core/src/main/scala/eu/nomad_lab/ParquetSchemaGenerator.scala +++ b/core/src/main/scala/eu/nomad_lab/ParquetSchemaGenerator.scala @@ -10,10 +10,10 @@ import org.apache.parquet.schema.MessageType import org.apache.parquet.schema.PrimitiveType import org.apache.parquet.schema.PrimitiveType.PrimitiveTypeName import org.apache.parquet.schema.Type.Repetition -import eu.nomad_lab.meta.{KnownMetaInfoEnvs, MetaInfoRecord} +import eu.nomad_lab.meta.{ KnownMetaInfoEnvs, MetaInfoRecord } import eu.nomad_lab.parsers.JsonWriterBackend -import scala.collection.{SortedSet, mutable} +import scala.collection.{ SortedSet, mutable } case class SchemaAndIndex(schemaList: List[Type], indexMap: Map[String, Int]) @@ -24,19 +24,20 @@ class ParquetSchemaGenerator { val rootSections = JsonWriterBackend.defaultRootSections val si = for (r <- rootSections) yield (r, createSchema(metaInfo = r, startIndex = 1)) val s = for (x <- si) yield new GroupType(REPEATED, x._1, new PrimitiveType(REQUIRED, PrimitiveTypeName.INT64, "c_index") :: x._2) - val schema = new MessageType("File", + val schema = new MessageType( + "File", new GroupType(REPEATED, "archive_context", new PrimitiveType(REQUIRED, PrimitiveTypeName.INT64, "c_index"), new PrimitiveType(REQUIRED, PrimitiveTypeName.BINARY, "archive_gid"), new GroupType(REPEATED, "calculation_context", new PrimitiveType(REQUIRED, PrimitiveTypeName.INT64, "c_index") :: - new PrimitiveType(REQUIRED, PrimitiveTypeName.BINARY, "calculation_gid") :: s.toList) - )) - val index: Map[String, Int] = Map(("archive_context" -> 0),("archive_gid" -> 1), ("calculation_context" -> 2), + new PrimitiveType(REQUIRED, PrimitiveTypeName.BINARY, "calculation_gid") :: s.toList)) + ) + val index: Map[String, Int] = Map(("archive_context" -> 0), ("archive_gid" -> 1), ("calculation_context" -> 2), ("calculation_gid" -> 1), ("shape" -> 0), ("values" -> 1), ("c_index" -> 0)) // Shape, values and c_index are the only non-unique items in the schema. Separately handled here with the top. -// si.foreach((s) => println(s._2.indexMap)) - val x= (schema, index ++ indexMap ++ rootSections.zipWithIndex.map{case (a,b)=> (a -> (b+2))})//si.foldLeft(index)((m, y) => m ++ y._2.indexMap ++ rootSections.zipWithIndex.map{case (a,b)=> (a -> (b+2))} )) -// println(x._2) + // si.foreach((s) => println(s._2.indexMap)) + val x = (schema, index ++ indexMap ++ rootSections.zipWithIndex.map { case (a, b) => (a -> (b + 2)) }) //si.foldLeft(index)((m, y) => m ++ y._2.indexMap ++ rootSections.zipWithIndex.map{case (a,b)=> (a -> (b+2))} )) + // println(x._2) x } @@ -51,7 +52,7 @@ class ParquetSchemaGenerator { case Some("C") => PrimitiveTypeName.BINARY case Some("D") => PrimitiveTypeName.BINARY case Some("b") => PrimitiveTypeName.BOOLEAN - case Some("r") => PrimitiveTypeName.INT32 + case Some("r") => PrimitiveTypeName.INT64 case _ => throw new Exception } } @@ -95,15 +96,15 @@ class ParquetSchemaGenerator { cDone += now if (mi.kindStr == "type_abstract_document_content") { cAbstract += now - cToDo ++= all.allDirectChildrenOf(now).filter{ x => !cDone(x) } + cToDo ++= all.allDirectChildrenOf(now).filter { x => !cDone(x) } } } - val valueSet:SortedSet[String] = cDone.filter(metaInfo => all.metaInfoRecordForName(metaInfo).get.kindStr == "type_document_content" ) - val sectionSet:SortedSet[String] = cDone.filter(metaInfo => all.metaInfoRecordForName(metaInfo).get.kindStr == "type_section") + val valueSet: SortedSet[String] = cDone.filter(metaInfo => all.metaInfoRecordForName(metaInfo).get.kindStr == "type_document_content") + val sectionSet: SortedSet[String] = cDone.filter(metaInfo => all.metaInfoRecordForName(metaInfo).get.kindStr == "type_section") val sectionStartIndex = startIndex + valueSet.size - indexMap = indexMap ++ valueSet.zipWithIndex.map{ case (x,i) => x -> (i + startIndex)} ++ sectionSet.zipWithIndex.map{ case (x,i) => x -> (i + sectionStartIndex)} + indexMap = indexMap ++ valueSet.zipWithIndex.map { case (x, i) => x -> (i + startIndex) } ++ sectionSet.zipWithIndex.map { case (x, i) => x -> (i + sectionStartIndex) } println(s"cToDo: ${cToDo}, cDone: ${cDone}, valueSet: ${valueSet}, sectionSet: ${sectionSet}") -// println("Newly added indices: "+ cSeparated.zipWithIndex.map{ case (x,i) => x -> (i + startIndex)}) + // println("Newly added indices: "+ cSeparated.zipWithIndex.map{ case (x,i) => x -> (i + startIndex)}) val valueSchema = valueSet.foldLeft(List[Type]())((x, y) => { val mi = all.metaInfoRecordForName(y).get if (mi.shape.getOrElse(Seq()).isEmpty) x :+ scalarValue(mi) else x :+ arrayValue(mi)