@Override
public ReadContext init(InitContext context) {
  MessageType requestedProjection =
      getSchemaForRead(context.getFileSchema(), getPartialReadSchema(context));
  return new ReadContext(requestedProjection);
}
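// For context, a ReadSupport implementation like the one above is normally
// handed to a reader builder. A minimal usage sketch, assuming a hypothetical
// MyReadSupport/MyRecord pair and an invented file path (read() throws
// IOException; error handling omitted for brevity):
ParquetReader<MyRecord> reader =
    ParquetReader.builder(new MyReadSupport(), new Path("/tmp/data.parquet")).build();
MyRecord record;
while ((record = reader.read()) != null) {
  // process one materialized record
}
reader.close();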
/**
 * Creates the Parquet-side ReadContext with the requested schema during the init phase.
 *
 * @param context the Parquet InitContext
 * @return the Parquet ReadContext
 */
@Override
public org.apache.parquet.hadoop.api.ReadSupport.ReadContext init(InitContext context) {
  Configuration configuration = context.getConfiguration();
  MessageType fileSchema = context.getFileSchema();
  String columnNames = configuration.get(IOConstants.COLUMNS);
  Map<String, String> contextMetadata = new HashMap<String, String>();
  boolean indexAccess = configuration.getBoolean(PARQUET_COLUMN_INDEX_ACCESS, false);

  if (columnNames != null) {
    List<String> columnNamesList = getColumnNames(columnNames);
    String columnTypes = configuration.get(IOConstants.COLUMNS_TYPES);
    List<TypeInfo> columnTypesList = getColumnTypes(columnTypes);

    MessageType tableSchema =
        getRequestedSchemaForIndexAccess(indexAccess, columnNamesList, columnTypesList, fileSchema);
    contextMetadata.put(HIVE_TABLE_AS_PARQUET_SCHEMA, tableSchema.toString());
    contextMetadata.put(PARQUET_COLUMN_INDEX_ACCESS, String.valueOf(indexAccess));
    this.hiveTypeInfo = TypeInfoFactory.getStructTypeInfo(columnNamesList, columnTypesList);

    return new ReadContext(
        getRequestedPrunedSchema(columnNamesList, tableSchema, configuration), contextMetadata);
  } else {
    contextMetadata.put(HIVE_TABLE_AS_PARQUET_SCHEMA, fileSchema.toString());
    return new ReadContext(fileSchema, contextMetadata);
  }
}
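// For illustration, a hedged sketch of the job configuration this init(...)
// consumes. The literal keys "columns" and "columns.types" mirror Hive's
// IOConstants.COLUMNS and IOConstants.COLUMNS_TYPES; the column names and
// types below are invented for the example.
Configuration conf = new Configuration();
conf.set("columns", "id,name");
conf.set("columns.types", "bigint,string");
// With these set, init(...) takes the pruning branch and requests only the
// named columns; without them it falls back to the full file schema.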
MessageType fullSchema = context.getFileSchema();
@Override
public ReadContext init(final InitContext context) {
  return new ReadContext(context.getFileSchema());
}
@Override
public ReadContext init(InitContext context) {
  String requestedProjectionString = context.getConfiguration().get(PB_REQUESTED_PROJECTION);
  if (requestedProjectionString != null && !requestedProjectionString.trim().isEmpty()) {
    MessageType requestedProjection =
        getSchemaForRead(context.getFileSchema(), requestedProjectionString);
    LOG.debug("Reading data with projection {}", requestedProjection);
    return new ReadContext(requestedProjection);
  } else {
    MessageType fileSchema = context.getFileSchema();
    LOG.debug("Reading data with schema {}", fileSchema);
    return new ReadContext(fileSchema);
  }
}
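// A hedged usage sketch: the projection arrives as a Parquet message-type
// string in the job configuration. The setter below is parquet-protobuf's
// ProtoParquetInputFormat.setRequestedProjection(Job, String); the projected
// schema string itself is invented for the example.
Job job = Job.getInstance(new Configuration());
ProtoParquetInputFormat.setRequestedProjection(job,
    "message Document { required int64 docId; }");
// init(...) then takes the first branch and reads only the docId column.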
/**
 * Called on the executor side before {@link #prepareForRead(Configuration, Map, MessageType, ReadContext)}
 * and before instantiating the actual Parquet record readers.
 * Responsible for figuring out the Parquet requested schema used for column pruning.
 */
@Override
public ReadContext init(InitContext context) {
  MessageType requestedSchema = clipParquetSchema(context.getFileSchema().asGroupType(), fieldNames);
  return new ReadContext(requestedSchema, new HashMap<String, String>());
}
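// The real clipParquetSchema(...) is not shown here, so the following is a
// simplified sketch of the clipping idea (top-level fields only, nested
// projection omitted), built on org.apache.parquet.schema.GroupType,
// MessageType and Type:
private static MessageType clipTopLevelFields(GroupType fileSchema, String[] fieldNames) {
  List<Type> clippedFields = new ArrayList<>();
  for (String fieldName : fieldNames) {
    // keep only requested fields that actually exist in the file schema
    if (fileSchema.containsField(fieldName)) {
      clippedFields.add(fileSchema.getType(fieldName));
    }
  }
  return new MessageType(fileSchema.getName(), clippedFields);
}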
/**
 * Called in {@link org.apache.hadoop.mapreduce.InputFormat#getSplits(org.apache.hadoop.mapreduce.JobContext)}
 * in the front end.
 *
 * @param context the initialisation context
 * @return the readContext that defines how to read the file
 */
public ReadContext init(InitContext context) {
  return init(context.getConfiguration(), context.getMergedKeyValueMetaData(), context.getFileSchema());
}
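// The one-argument init above delegates to the older three-argument overload,
// which legacy ReadSupport subclasses still override. A hedged sketch of such
// a subclass (the class name is invented; GroupRecordConverter is the Group
// materializer from Parquet's example module):
public class PassThroughReadSupport extends ReadSupport<Group> {
  @Override
  public ReadContext init(Configuration configuration,
      Map<String, String> keyValueMetaData, MessageType fileSchema) {
    // no projection: request the full file schema
    return new ReadContext(fileSchema);
  }

  @Override
  public RecordMaterializer<Group> prepareForRead(Configuration configuration,
      Map<String, String> keyValueMetaData, MessageType fileSchema, ReadContext readContext) {
    return new GroupRecordConverter(readContext.getRequestedSchema());
  }
}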
@Override
public ReadContext init( InitContext context ) {
  String schemaStr = context.getConfiguration().get( ParquetConverter.PARQUET_SCHEMA_CONF_KEY );
  if ( schemaStr == null ) {
    throw new RuntimeException( "Schema not defined in the PentahoParquetSchema key" );
  }
  ParquetInputFieldList schema = ParquetInputFieldList.unmarshall( schemaStr );
  converter = new ParquetConverter( schema.getFields() );

  // get all fields from file's schema
  MessageType fileSchema = context.getFileSchema();
  List<Type> newFields = new ArrayList<>();

  // use only required fields
  for ( IParquetInputField f : schema ) {
    Type origField = fileSchema.getFields().get( fileSchema.getFieldIndex( f.getFormatFieldName() ) );
    newFields.add( origField );
  }
  if ( newFields.isEmpty() ) {
    throw new RuntimeException( "Fields should be declared" );
  }
  MessageType newSchema = new MessageType( fileSchema.getName(), newFields );
  return new ReadContext( newSchema, new HashMap<>() );
}
@Override
public org.apache.parquet.hadoop.api.ReadSupport.ReadContext init(InitContext context) {
  final Configuration configuration = context.getConfiguration();
  final MessageType fileMessageType = context.getFileSchema();
  MessageType requestedProjection = fileMessageType;
  String partialSchemaString = configuration.get(ReadSupport.PARQUET_READ_SCHEMA);

  FieldProjectionFilter projectionFilter = getFieldProjectionFilter(configuration);

  if (partialSchemaString != null && projectionFilter != null) {
    throw new ThriftProjectionException(
        String.format("You cannot provide both a partial schema and a field projection filter. "
            + "Only one of (%s, %s, %s) should be set.",
            PARQUET_READ_SCHEMA, STRICT_THRIFT_COLUMN_FILTER_KEY, THRIFT_COLUMN_FILTER_KEY));
  }

  // set requestedProjection only when it is specified
  if (partialSchemaString != null) {
    requestedProjection = getSchemaForRead(fileMessageType, partialSchemaString);
  } else if (projectionFilter != null) {
    try {
      initThriftClassFromMultipleFiles(context.getKeyValueMetadata(), configuration);
      requestedProjection = getProjectedSchema(projectionFilter);
    } catch (ClassNotFoundException e) {
      throw new ThriftProjectionException("cannot find thriftClass from configuration", e);
    }
  }

  MessageType schemaForRead = getSchemaForRead(fileMessageType, requestedProjection);
  return new ReadContext(schemaForRead);
}
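// A hedged usage sketch: exactly one projection mechanism may be configured.
// ReadSupport.PARQUET_READ_SCHEMA is the standard "parquet.read.schema" key;
// the partial schema string below is invented for the example.
Configuration conf = new Configuration();
conf.set(ReadSupport.PARQUET_READ_SCHEMA, "message Person { required binary name; }");
// Setting a thrift column-filter key in the same configuration as well would
// make init(...) throw the ThriftProjectionException shown above.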