
Transforms

Python

Parse Excel files with Python

How can I read and process complex Excel files with a dynamic schema in a distributed manner using PySpark and openpyxl?

This code uses PySpark and the openpyxl library to read multiple Excel files from an input filesystem, parse their content, and convert them into PySpark DataFrames. The DataFrames are then combined into a single DataFrame and written to the output.

```python
from pyspark.sql import functions as F, types as T, DataFrame
from transforms.api import transform, Input, Output, configure
import tempfile
import shutil
import openpyxl
import functools


@transform(
    processed_excel=Output("example_processed_dataframe"),
    excel_input=Input("example_excel_dataframe"),
)
def compute(ctx, processed_excel, excel_input):
    # parse_workbook and convert_to_df are user-defined helpers (see the sketch below)
    def parse_file(file_status):
        # Open the Excel file
        with excel_input.filesystem().open(file_status.path, "rb") as in_xlsx:
            # Create a temporary file for processing
            with tempfile.NamedTemporaryFile(suffix=".xlsx") as tmp_xlsx:
                shutil.copyfileobj(in_xlsx, tmp_xlsx)
                tmp_xlsx.flush()
                # Load the Excel workbook and parse its content
                try:
                    workbook = openpyxl.load_workbook(tmp_xlsx.name)
                    return parse_workbook(workbook)
                except Exception:
                    return None

    # Get the list of Excel files from the input filesystem
    files_df = excel_input.filesystem().files()

    # Parse each file using the 'parse_file' function
    parsed_files = files_df.rdd.map(parse_file).collect()

    # Convert the parsed files to PySpark dataframes
    dfs = []
    for parsed_file in parsed_files:
        dfs.append(convert_to_df(ctx, parsed_file))

    # Union the dataframes and write the result to the output
    df = functools.reduce(lambda df1, df2: df1.unionByName(df2, allowMissingColumns=True), dfs)
    processed_excel.write_dataframe(df)
```
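Note that `parse_workbook` and `convert_to_df` are user-defined helpers that the snippet assumes but does not show. A minimal sketch of what they could look like, assuming each sheet's first row holds the column headers and a string-typed output schema is acceptable:

```python
from pyspark.sql import types as T


def parse_workbook(workbook):
    # Hypothetical helper: flatten every sheet into a list of dicts,
    # treating the first row of each sheet as the header row.
    rows = []
    for sheet in workbook.worksheets:
        values = list(sheet.iter_rows(values_only=True))
        if not values:
            continue
        header = [str(h) for h in values[0]]
        for raw_row in values[1:]:
            rows.append({col: (str(val) if val is not None else None)
                         for col, val in zip(header, raw_row)})
    return rows


def convert_to_df(ctx, parsed_file):
    # Hypothetical helper: build a string-typed DataFrame from the parsed rows.
    # Assumes parsed_file is a non-empty list of dicts (i.e. parsing succeeded).
    columns = sorted({col for row in parsed_file for col in row})
    schema = T.StructType([T.StructField(col, T.StringType()) for col in columns])
    data = [[row.get(col) for col in columns] for row in parsed_file]
    return ctx.spark_session.createDataFrame(data, schema)
```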
  • Date submitted: 2024-08-12
  • Tags: code authoring, code repositories, python, openpyxl

Combine shapefiles and convert to GeoJSON

How do I combine multiple shapefiles and convert them to GeoJSON format?

This code uses the geospatial_tools library to read multiple shapefiles, convert their geometries to GeoJSON format, and combine them into a single PySpark DataFrame. It also computes the centroid of each geometry and converts it to a geohash.

```python
from transforms.api import transform, Input, Output
from geospatial_tools import geospatial
from geospatial_tools.functions import clean_geometry, centroid, geojson_to_geohash
import tempfile
import shutil
import geopandas as gpd
from pyspark.sql import types as T
from pyspark.sql import functions as F
import json
from shapely.geometry import mapping


@geospatial()
@transform(
    output=Output(),
    input_data=Input(),
)
def compute(ctx, input_data, output):
    fs = input_data.filesystem()
    schema = T.StructType([
        T.StructField("geoshape", T.StringType()),
        T.StructField("name", T.StringType()),
        T.StructField("centroid", T.StringType()),
    ])
    shapefiles = [f.path.replace('.shp', '') for f in fs.ls(glob='*shp')]

    combined_data = ctx.spark_session.createDataFrame([], schema)
    for shapefile in shapefiles:  # NOQA
        with tempfile.TemporaryDirectory() as tmp_dir:
            # Copy all files for the shapefile to the local filesystem
            # There are multiple files associated with a shapefile, such as .prj and .cpg
            for shapefile_file in fs.ls(glob=f'{shapefile}.*'):
                with open(f'{tmp_dir}/{shapefile_file.path}', 'wb') as tmp_file:
                    with fs.open(shapefile_file.path, 'rb') as f:
                        shutil.copyfileobj(f, tmp_file)

            # Create a GeoJSON geometry column
            pdf = gpd.read_file(f'{tmp_dir}/{shapefile}.shp')
            pdf['geometry'] = pdf.geometry.apply(lambda x: json.dumps(mapping(x)))
            df = ctx.spark_session.createDataFrame(pdf)

            # Convert everything to the EPSG:4326 format expected by Foundry
            crs = gpd.read_file(f'{tmp_dir}/{shapefile}.shp').crs.to_string()
            df = df.withColumn(
                "geoshape",
                clean_geometry('geometry', crs, lat_long=(crs != "EPSG:4326"))
            ).select("geoshape")
            df = df.withColumn('name', F.lit(shapefile))
            df = df.withColumn('centroid', geojson_to_geohash(centroid('geoshape')))
            combined_data = combined_data.unionByName(df)

    return output.write_dataframe(combined_data)
```
  • Date submitted: 2024-05-23
  • Tags: geospatial, shapefile, geojson, geohash, pyspark, geopandas

Copy raw files between datasets

How can I copy raw files from an input dataset to an output dataset within a Python authoring transform?

This code defines a PySpark transform function to copy raw files from an input dataset to an output dataset. It uses the 'shutil' library to copy the file bytes and allows for copying all files or only a subset based on provided regex patterns.

```python
from transforms.api import transform, Input, Output
from pyspark.sql import DataFrame
from functools import reduce
import shutil


# Copy raw files from the input dataset to the output dataset in a Python transform
@transform(
    my_output=Output("my_output_dataset"),
    my_input=Input("my_input_dataset")
)
def copy_my_input(my_output, my_input):
    copy_raw_files(my_output, my_input, [r".*\.csv"], False)


def copy_raw_files(my_output, my_input, regexes, copy_full=False):
    # Copies the raw files
    def copy_file(file_status):
        # Open the given file in the input dataset's filesystem
        with my_input.filesystem().open(file_status.path, 'rb') as in_f:
            # Open a file in the output dataset's filesystem
            with my_output.filesystem().open(file_status.path, 'wb') as out_f:
                # Copy the file bytes from input to output
                shutil.copyfileobj(in_f, out_f)

    # Choose whether to copy all the files or only a subset
    if copy_full:
        files_df = my_input.filesystem().files()
    else:
        files_to_copy = []
        for regex in regexes:
            # Only copy files that match the regex
            files_to_copy.append(my_input.filesystem().files(regex=regex))
        # Create one dataframe with all the files
        files_df = reduce(DataFrame.unionByName, files_to_copy)

    # This will parallelise the copy operation
    files_df.rdd.foreach(copy_file)
```
  • Date submitted: 2024-03-20
  • Tags: code authoring, code repositories, python

File processing

How do I process multiple files in a dataset using PySpark?

This code uses PySpark to process multiple files in a dataset, including gzipped files, by reading the first line of each file and creating a dataframe with the file information and the first line content.

```python
from transforms.api import transform, Input, Output, incremental
from pyspark.sql import types as T
from pyspark.sql import functions as F
from pyspark.sql import Row
import gzip
import io


# Works with or without the @incremental decorator.
# @transform is used instead of @transform_df because it gives more control over inputs
# and outputs and is needed to access the file-level view of the input dataset(s).
@transform(
    output_dataset_1=Output(""),
    output_dataset_2=Output(""),
    input_dataset=Input("")
)
def example_transform_file_processing(ctx, input_dataset, output_dataset_1, output_dataset_2):
    # The files() method returns a dataframe representing the filesystem of the input dataset
    fs = input_dataset.filesystem()
    files_df = input_dataset.filesystem().files()
    # From here you can extract the path of each file and process it however you need

    # ==== Example of computation

    # Defines the schema of the RDD flatMap output
    schema = T.StructType([
        T.StructField('hadoop_path', T.StringType()),
        T.StructField('file_name', T.StringType()),
        T.StructField('size', T.LongType()),
        T.StructField('modified', T.LongType()),
        T.StructField('first_row_content', T.StringType())
    ])
    cols = schema.fieldNames()  # ["hadoop_path", "file_name", "size", "modified", "first_row_content"]
    MyRow = Row(*cols)  # definition of the "MyRow" object used as the return type of the RDD UDF-like function

    # Inline function to parse one file (idea: like a UDF, but for an RDD)
    def process_file(file_status):
        # Example of processing: read the first line of each file
        line = "default value"
        try:
            line = "WARNING: Not supported file type."
            if file_status.path.endswith('.gz'):
                # Handle gzipped files
                with fs.open(file_status.path, "rb") as f:
                    gz = gzip.GzipFile(fileobj=f)
                    br = io.BufferedReader(gz)
                    tw = io.TextIOWrapper(br)
                    line = tw.readline()
            else:
                with fs.open(file_status.path, "r") as f:
                    line = f.readline()
        except Exception as e:
            line = "ERROR: " + str(e)

        # Create a row out of the RDD element
        yield MyRow(fs.hadoop_path, file_status.path, file_status.size, file_status.modified, line)

    # Convert the files dataframe to an RDD. See https://spark.apache.org/docs/latest/rdd-programming-guide.html
    rdd = files_df.rdd
    # Apply a function to each element of the RDD
    rdd = rdd.flatMap(process_file)
    # Convert the RDD back into a dataframe so it can easily be written to the output.
    # Specifying the schema allows an empty RDD to be handled.
    output_df = ctx.spark_session.createDataFrame(rdd, schema)
    # Add a timestamp
    output_df = output_df.withColumn('processed_at', F.current_timestamp())

    # ==== End of example computation

    # Write the filesystem dataframe representation of the input to the first output
    output_dataset_1.write_dataframe(files_df)
    # Write the processed dataframe to the second output
    output_dataset_2.write_dataframe(output_df)
```
  • Date submitted: 2024-03-20
  • Tags: code authoring, code repositories, python, gzip, zip

Load ORC file using PySpark

How do I load an ORC file using PySpark?

This code reads a raw ORC file from the Hadoop path of an input dataset and writes the resulting Spark DataFrame to an output.

```python
from transforms.api import transform, Input, Output


@transform(
    out=Output("output"),
    raw=Input("input"),
)
def compute(ctx, out, raw):
    hadoop_path = raw.filesystem().hadoop_path
    df = ctx.spark_session.read.format('orc').load(f'{hadoop_path}/')
    out.write_dataframe(df)
```
  • Date submitted: 2024-07-18
  • Tags: pyspark, dataframe, orc, hadoop

Parse SAS files using PySpark

How do I create a PySpark dataframe from SAS datasets?

This code defines a transform function that takes an input dataset containing raw SAS files and creates a PySpark dataframe from them. It uses the spark-sas7bdat package to read the SAS files and load them into a dataframe.

```python
from transforms.api import transform, Input, Output


@transform(
    output=Output("xxxxx"),   # include Foundry RID here
    input_df=Input("xxxxx")   # include Foundry RID here
)
def parse_sas_file(ctx, input_df, output, sas_path="*.sas7bdat"):
    '''
    Creates a PySpark dataframe from SAS datasets

    Note that this function performs computation in the driver,
    and may require an increase in driver memory

    ctx: Transform context (provides the Spark session)
    input_df: Input dataset containing raw SAS files
    sas_path: Path to SAS files within the dataset; defaults to all SAS files in the dataset
    '''
    fs = input_df.filesystem()
    hadoop_path = fs.hadoop_path
    files_df = fs.files(sas_path)

    spark_session = ctx.spark_session.builder \
        .appName(ctx.spark_session.sparkContext.appName) \
        .config('spark.jars.packages', 'saurfang:spark-sas7bdat:3.0.0-s_2.12') \
        .getOrCreate()

    # TODO: Update this to work for multiple paths
    # Read the first SAS file from the backing dataset
    path = files_df.collect()[0].path
    full_path = f'{hadoop_path}/{path}'

    df = spark_session.read.format("com.github.saurfang.sas.spark").load(full_path)
    output.write_dataframe(df)
```
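The TODO above notes that only the first SAS file is read. A sketch of how the single-file read could be extended to every matching file, reusing `hadoop_path`, `files_df`, `spark_session`, and `output` from the transform above and assuming at least one file matches:

```python
import functools

# Sketch only: loop over every matching SAS file and union the per-file DataFrames.
paths = [f'{hadoop_path}/{row.path}' for row in files_df.collect()]
dfs = [
    spark_session.read.format("com.github.saurfang.sas.spark").load(p)
    for p in paths
]
df = functools.reduce(
    lambda df1, df2: df1.unionByName(df2, allowMissingColumns=True), dfs
)
output.write_dataframe(df)
```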
  • Date submitted: 2024-07-29
  • Tags: pyspark, dataframe, sas, code repositories

Process and combine multiple files

How do I process multiple files in a dataset and combine them into a single PySpark DataFrame?

This code defines a PySpark transform that takes an input dataset containing multiple files, processes each file individually, and combines the results into a single PySpark DataFrame. It uses the 'map' function to apply the 'parse_file' function to each file in the dataset, collects the results, and unions all the DataFrames together.

```python
from pyspark.sql import functions as F, types as T, DataFrame
from transforms.api import transform, Input, Output, configure
import tempfile
import shutil
import functools


# Define the PySpark transform
@transform(
    processed_files=Output("example_processed_files_dataset"),
    file_dataset=Input("example_file_dataset"),
)
def compute(ctx, processed_files, file_dataset):
    # Function to parse a single file
    def parse_file(file_status):
        with file_dataset.filesystem().open(file_status.path, "rb") as in_file:
            with tempfile.NamedTemporaryFile() as tmp_file:
                shutil.copyfileobj(in_file, tmp_file)
                tmp_file.flush()
                # Process the file locally and return a Python object
                return process_file_locally_and_return_python_object(tmp_file)

    # Get the list of files in the dataset
    files_df = file_dataset.filesystem().files()
    # Parse each file and collect the results
    parsed_files = files_df.rdd.map(parse_file).collect()

    dfs = []
    for parsed_file in parsed_files:
        # Convert the parsed file to a PySpark DataFrame
        dfs.append(convert_to_df(ctx, parsed_file))

    # Union all the DataFrames together
    df = functools.reduce(lambda df1, df2: df1.unionByName(df2, allowMissingColumns=True), dfs)
    # Write the resulting DataFrame to the output dataset
    processed_files.write_dataframe(df)
```
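As in the Excel example above, `process_file_locally_and_return_python_object` and `convert_to_df` are placeholders for user-defined helpers. A minimal sketch of one possible pair, assuming the goal is simply to record each file's size and first line:

```python
from pyspark.sql import types as T


def process_file_locally_and_return_python_object(tmp_file):
    # Hypothetical helper: read the temporary file and return a plain Python object,
    # here the byte count and the first line of text.
    tmp_file.seek(0)
    data = tmp_file.read()
    first_line = data.split(b"\n", 1)[0].decode("utf-8", errors="replace")
    return {"size_bytes": len(data), "first_line": first_line}


def convert_to_df(ctx, parsed_file):
    # Hypothetical helper: turn the Python object into a single-row DataFrame.
    schema = T.StructType([
        T.StructField("size_bytes", T.LongType()),
        T.StructField("first_line", T.StringType()),
    ])
    return ctx.spark_session.createDataFrame(
        [(parsed_file["size_bytes"], parsed_file["first_line"])], schema
    )
```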
  • Date submitted: 2024-03-20
  • Tags: code authoring, code repositories, python, raw files, unstructured

Extracting content from DOCX files

How do I extract content from a DOCX file using Python?

This code uses the python-docx library to read the content of a DOCX file from a dataset and store it in a Document object for further processing.

```python
from transforms.api import transform, Input, Output
import docx as dx
from io import BytesIO


@transform(
    output=Output("output_dataset"),
    docs=Input("input_dataset"),
)
def compute(ctx, docs, output):
    fs = docs.filesystem()
    doc_file = list(fs.ls(regex=r'.*\.docx'))[0]

    # Open the file with the filesystem and read its content into a BytesIO object
    with fs.open(doc_file.path, 'rb') as f:
        source_stream = BytesIO(f.read())
        document = dx.Document(source_stream)
        source_stream.close()

    # Do something with the document object
```
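The snippet ends at `# Do something with the document object`. One possible continuation, shown as a sketch only, collects the non-empty paragraphs into a single-column DataFrame and writes it to the output:

```python
# Sketch only: one way the document object could be used inside the transform above.
from pyspark.sql import types as T

paragraphs = [p.text for p in document.paragraphs if p.text.strip()]
schema = T.StructType([T.StructField("paragraph_text", T.StringType())])
paragraphs_df = ctx.spark_session.createDataFrame([(p,) for p in paragraphs], schema)
output.write_dataframe(paragraphs_df)
```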
  • Date submitted: 2024-03-20
  • Tags: code authoring, python, python-docx, bytesio, raw files, unstructured

Process zipped CSV

How do I read and process a zip file containing multiple CSV files from an input dataset and write the processed data to an output dataset in PySpark?

This code uses PySpark to read and process zip files containing CSVs in an input dataset, skipping the first line of each CSV, and writes the processed data to an output dataset.

```python
from pyspark.sql import functions as F, types as T, DataFrame
from transforms.api import transform, Input, Output, configure
import shutil
import tempfile
import zipfile
import io


@transform(
    my_output=Output("my_output_dataset"),
    my_input=Input("my_input_dataset")
)
def compute(ctx, my_output, my_input):
    fs = my_input.filesystem()

    # Function to process each file in the input dataset
    def process_file(file_status):
        with fs.open(file_status.path, 'rb') as f:
            with tempfile.NamedTemporaryFile() as tmp:
                shutil.copyfileobj(f, tmp)
                tmp.flush()

                # Read and process the zip file
                with zipfile.ZipFile(tmp) as archive:
                    for filename in archive.namelist():
                        with archive.open(filename) as f2:
                            br = io.BufferedReader(f2)
                            tw = io.TextIOWrapper(br)
                            tw.readline()  # Skip the first line of each CSV

                            # Read and process each line in the CSV
                            # MyRow is a pyspark.sql.Row matching the CSV columns (see the sketch below)
                            for line in tw:
                                yield MyRow(*line.split(","))

    # Read the input dataset's filesystem and process each file
    rdd = fs.files().rdd
    rdd = rdd.flatMap(process_file)
    df = rdd.toDF()

    # Write the processed data to the output dataset
    my_output.write_dataframe(df)
```
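`MyRow` is assumed to be defined elsewhere as a `pyspark.sql.Row` template whose fields match the CSV columns. A hypothetical definition for a three-column CSV might look like this; defining it at module level keeps it picklable for use inside `flatMap`:

```python
from pyspark.sql import Row

# Hypothetical: adjust the field names to match the actual CSV columns.
MyRow = Row("id", "name", "value")
```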
  • Date submitted: 2024-03-20
  • Tags: code authoring, code repositories, python, zip, csv

Unzipping and extracting files in dataset

How can I unzip a file in a dataset?

This code uses PySpark to read zipped files from an input, extract the contents, and write the extracted files to an output. It does this by iterating through the zipped files, reading their contents into a BytesIO stream, and then using the zipfile library to extract the files to a temporary directory. The extracted files are then written to the output.

```python
from transforms.api import transform, Input, Output
import zipfile
import tempfile
import os
from io import BytesIO


@transform(
    unzipped=Output(""),
    zipped=Input(""),
)
def compute(unzipped, zipped):
    zip_files = zipped.filesystem().files(glob="*.zip").collect()

    for zip_file in zip_files:
        with zipped.filesystem().open(zip_file["path"], 'rb') as zip_f:
            source_stream = BytesIO(zip_f.read())
            with zipfile.ZipFile(source_stream, 'r') as zip_ref:
                with tempfile.TemporaryDirectory() as temp_dir:
                    zip_ref.extractall(temp_dir)
                    for path in iterate_directories(temp_dir):
                        output_file_name = path.replace(temp_dir, "")
                        # Copy each extracted file to the output in binary mode
                        with unzipped.filesystem().open(output_file_name, "wb") as out_f:
                            with open(path, 'rb') as in_f:
                                out_f.write(in_f.read())


def iterate_directories(directory):
    for root, dirs, files in os.walk(directory):
        for file in files:
            path = os.path.join(root, file)
            if is_leaf_file(path):
                yield path


def is_leaf_file(path):
    return os.path.isfile(path) and not os.path.islink(path)
```
  • Date submitted: 2024-03-20
  • Tags: code authoring, code repositories, python, raw files, zip, unzip

Zip dataset files

How can I create a zip file from a dataset of files?

This code uses the transforms API to read all Markdown files from a source dataset and create a zip file containing these files.

```python
from transforms.api import transform, Input, Output
import zipfile


@transform(
    my_output=Output(""),
    source_df=Input(""),
)
def compute(ctx, my_output, source_df):
    files = source_df.filesystem().files(glob="*.md").collect()

    with my_output.filesystem().open("foundry_code_examples.zip", 'wb') as write_zip:
        with zipfile.ZipFile(write_zip.name, 'w') as zip_file:
            for file_row in files:
                with source_df.filesystem().open(file_row["path"], 'rb') as markdown_file:
                    zip_file.write(markdown_file.name, arcname=file_row["path"])
```
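The example relies on the `.name` attribute of the opened file objects resolving to usable local paths. If that assumption does not hold in your environment, a sketch of an alternative streams each file's bytes straight into the archive with `zipfile.ZipFile.writestr`, passing the writable stream directly to `ZipFile`:

```python
# Sketch only: build the zip from file bytes instead of local file paths.
with my_output.filesystem().open("foundry_code_examples.zip", 'wb') as write_zip:
    with zipfile.ZipFile(write_zip, 'w') as zip_file:
        for file_row in files:
            with source_df.filesystem().open(file_row["path"], 'rb') as markdown_file:
                zip_file.writestr(file_row["path"], markdown_file.read())
```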
  • Date submitted: 2024-03-26
  • Tags: raw files, zip, python, code authoring, code repositories, export

Java

Parse Excel files with complex, multi-row headers

How can I parse Excel files where the header is complex and consists of multiple rows?

This code demonstrates how to parse Excel files with complex headers using the transforms-excel-parser library. It creates a TableParser with a MultilayerMergedHeaderExtractor, then creates a TransformsExcelParser with the TableParser. Finally, it uses the TransformsExcelParser to extract data from the Excel files in the input dataset and writes the result to the output.

```java
package myproject.datasets;

import com.palantir.transforms.excel.ParseResult;
import com.palantir.transforms.excel.Parser;
import com.palantir.transforms.excel.TransformsExcelParser;
import com.palantir.transforms.excel.table.MultilayerMergedHeaderExtractor;
import com.palantir.transforms.excel.table.TableParser;
import com.palantir.transforms.lang.java.api.Compute;
import com.palantir.transforms.lang.java.api.FoundryInput;
import com.palantir.transforms.lang.java.api.FoundryOutput;
import com.palantir.transforms.lang.java.api.Input;
import com.palantir.transforms.lang.java.api.Output;
import java.util.Optional;
import org.apache.spark.sql.Dataset;
import org.apache.spark.sql.Row;

public final class ComplexHeaderExcel {
    @Compute
    public void myComputeFunction(
            @Input("<input_dataset_rid>") FoundryInput myInput,
            @Output("<output_dataset_rid>") FoundryOutput myOutput,
            @Output("<error_output_dataset_rid>") FoundryOutput errorOutput) {
        // Create a TableParser with a MultilayerMergedHeaderExtractor
        Parser tableParser = TableParser.builder()
                .headerExtractor(MultilayerMergedHeaderExtractor.builder()
                        .topLeftCellName("A1")
                        .bottomRightCellName("D2")
                        .build())
                .build();

        // Create a TransformsExcelParser with the TableParser
        TransformsExcelParser transformsParser = TransformsExcelParser.of(tableParser);

        // Parse input
        ParseResult result = transformsParser.parse(myInput.asFiles().getFileSystem().filesAsDataset());

        // Get the parsed data, which may be empty if there were no rows in the input
        // or an error occurred
        Optional<Dataset<Row>> maybeDf = result.singleResult();

        // If parsed data is not empty, write it to the output dataset
        maybeDf.ifPresent(df -> myOutput.getDataFrameWriter(df).write());

        // Write error information to the error output
        errorOutput.getDataFrameWriter(result.errorDataframe()).write();
    }
}
```
  • Date submitted: 2024-08-08
  • Tags: code authoring, code repositories, java, transforms-excel-parser, excel

Parse Excel files with non-tabular (form) data

How do I parse Excel files where the data is not tabular?

This code demonstrates how to use the transforms-excel-parser library to extract data from Excel files containing forms across multiple sheets.

```java
package myproject.datasets;

import com.palantir.transforms.excel.TransformsExcelParser;
import com.palantir.transforms.excel.ParseResult;
import com.palantir.transforms.excel.Parser;
import com.palantir.transforms.excel.form.FieldSpec;
import com.palantir.transforms.excel.form.FormParser;
import com.palantir.transforms.excel.form.Location;
import com.palantir.transforms.excel.form.cellvalue.AdjacentCellAssertion;
import com.palantir.transforms.excel.form.cellvalue.CellValue;
import com.palantir.transforms.excel.functions.RegexSubstringMatchingSheetSelector;
import com.palantir.transforms.lang.java.api.Compute;
import com.palantir.transforms.lang.java.api.FoundryInput;
import com.palantir.transforms.lang.java.api.FoundryOutput;
import com.palantir.transforms.lang.java.api.Input;
import com.palantir.transforms.lang.java.api.Output;

public final class FormStyleExcel {
    private static final String FORM_A_KEY = "FORM_A";
    private static final String FORM_B_KEY = "FORM_B";

    @Compute
    public void myComputeFunction(
            @Input("<input_dataset_rid>") FoundryInput myInput,
            @Output("<form_a_output_dataset_rid>") FoundryOutput formAOutput,
            @Output("<form_b_output_dataset_rid>") FoundryOutput formBOutput,
            @Output("<error_output_dataset_rid>") FoundryOutput errorOutput) {
        // Form A parser configuration
        Parser formAParser = FormParser.builder()
                .sheetSelector(new RegexSubstringMatchingSheetSelector("Form_A"))
                .addFieldSpecs(createFieldSpec("form_a_field_1", "B1"))
                .addFieldSpecs(createFieldSpec("form_a_field_2", "B2"))
                .build();

        // Form B parser configuration
        Parser formBParser = FormParser.builder()
                .sheetSelector(new RegexSubstringMatchingSheetSelector("Form_B"))
                .addFieldSpecs(createFieldSpec("form_b_field_1", "B1"))
                .addFieldSpecs(createFieldSpec("form_b_field_2", "B2"))
                .build();

        // TransformsExcelParser with both Form A and Form B parsers
        TransformsExcelParser transformsParser = TransformsExcelParser.builder()
                .putKeyToParser(FORM_A_KEY, formAParser)
                .putKeyToParser(FORM_B_KEY, formBParser)
                .build();

        // Parse input
        ParseResult result = transformsParser.parse(myInput.asFiles().getFileSystem().filesAsDataset());

        // Write parsed data to the output datasets
        result.dataframeForKey(FORM_A_KEY)
                .ifPresent(df -> formAOutput.getDataFrameWriter(df).write());
        result.dataframeForKey(FORM_B_KEY)
                .ifPresent(df -> formBOutput.getDataFrameWriter(df).write());

        // Write error information to the error output
        errorOutput.getDataFrameWriter(result.errorDataframe()).write();
    }

    // Helper method to concisely create a FieldSpec with an appropriate assertion
    private static FieldSpec createFieldSpec(String fieldName, String cellLocation) {
        return FieldSpec.of(
                fieldName,
                CellValue.builder()
                        .addAssertions(AdjacentCellAssertion.left(1, fieldName))
                        .location(Location.of(cellLocation))
                        .build());
    }
}
```
  • Date submitted: 2024-08-06
  • Tags: code authoring, code repositories, java, transforms-excel-parser, excel

Parse simple tabular Excel files

How do I parse simple tabular Excel files using Transforms Excel Parser?

This code demonstrates how to parse a dataset containing simple tabular Excel files using the transforms-excel-parser library. It creates a TableParser with a SimpleHeaderExtractor, then creates a TransformsExcelParser with the TableParser. Finally, it uses the TransformsExcelParser to parse the files in the input dataset and writes the extracted data to the output dataset.

```java
package myproject.datasets;

import com.palantir.transforms.excel.ParseResult;
import com.palantir.transforms.excel.Parser;
import com.palantir.transforms.excel.TransformsExcelParser;
import com.palantir.transforms.excel.table.SimpleHeaderExtractor;
import com.palantir.transforms.excel.table.TableParser;
import com.palantir.transforms.lang.java.api.Compute;
import com.palantir.transforms.lang.java.api.FoundryInput;
import com.palantir.transforms.lang.java.api.FoundryOutput;
import com.palantir.transforms.lang.java.api.Input;
import com.palantir.transforms.lang.java.api.Output;
import java.util.Optional;
import org.apache.spark.sql.Dataset;
import org.apache.spark.sql.Row;

public final class SimpleTabularExcel {
    @Compute
    public void myComputeFunction(
            @Input("<input_dataset_rid>") FoundryInput myInput,
            @Output("<output_dataset_rid>") FoundryOutput myOutput,
            @Output("<error_output_dataset_rid>") FoundryOutput errorOutput) {
        // Create a TableParser with an appropriately configured SimpleHeaderExtractor.
        // In this example, the header of the file is on the second row.
        // If the header were on the first row, we would not need to
        // specify rowsToSkip, since the default value is 0, and in fact
        // we could just do TableParser.builder().build() in that case.
        Parser tableParser = TableParser.builder()
                .headerExtractor(
                        SimpleHeaderExtractor.builder().rowsToSkip(1).build())
                .build();

        // Create a TransformsExcelParser with the TableParser
        TransformsExcelParser transformsParser = TransformsExcelParser.of(tableParser);

        // Parse input
        ParseResult result = transformsParser.parse(myInput.asFiles().getFileSystem().filesAsDataset());

        // Get the parsed data, which may be empty if there were no rows in the input
        // or an error occurred
        Optional<Dataset<Row>> maybeDf = result.singleResult();

        // If parsed data is not empty, write it to the output dataset
        maybeDf.ifPresent(df -> myOutput.getDataFrameWriter(df).write());

        // Write error information to the error output
        errorOutput.getDataFrameWriter(result.errorDataframe()).write();
    }
}
```
  • Date submitted: 2024-08-08
  • Tags: code authoring, code repositories, java, transforms-excel-parser, excel