Document-to-text transformations provide a flexible and performant way to extract document content in multiple formats, including the ability to extract document layout structure, such as paragraphs, headers, and tables.
AIP Document Intelligence provides two document-to-text media transformation operations:
- extractTextV2: Returns a list of strings containing extracted document text.
- extractLayoutAwareTextV2: Returns a list of layout-aware text blocks across pages.

We recommend using the operations listed above, which replace the extractLayoutAwareContent and ocrOnPage operations.
You can reference an example operation signature below:
{
  "type": "documentToText",
  "documentToText": {
    "operation": {
      "type": "{operation}", // {operation}: "extractTextV2" or "extractLayoutAwareTextV2"
      "{operation}": {
        "pageRange": {
          "startPageInclusive": 0,
          "endPageExclusive": 5
        },
        "config": {
          "mode": "SCAN", // or "ELECTRONIC" or "AUTO"
          "format": "TEXT", // or "MARKDOWN" or "HTML"
          "languages": [
            {
              "type": "language",
              "language": "KOR"
            }
          ]
        }
      }
    }
  }
}
config.mode: Controls how the document pages are interpreted.
- ELECTRONIC: Treats all pages as electronic PDF files and extracts embedded/raw text. Use ELECTRONIC when you know your documents contain embedded text.
- SCAN: Treats all pages as scanned images and performs Optical Character Recognition (OCR). Use SCAN when you know your documents are scanned images.
- AUTO: Automatically decides per-page whether OCR is needed. Use only when you do not know ahead of time whether the PDF files are electronic, scanned, or mixed. AUTO mode runs with a small compute overhead.

config.format: Controls the output format.
- TEXT
- MARKDOWN
- HTML

config.languages: Controls the languages to be detected, which are used for OCR in SCAN or AUTO mode.
- language: The language code, such as KOR.
- type: Always set to "language".

pageRange: Controls the pages to be extracted. In the operation signature above, the page range processes pages 0, 1, 2, 3, and 4. Use page ranges to improve build performance for larger documents. Instead of issuing one request per page, you can batch multiple pages together. A page range of roughly five to ten pages is an appropriate starting point, depending on document size, model limits, and rate limits.

Reference the examples below to help you execute a document-to-text transformation using a Python transform or function.
import polars as pl
from concurrent.futures import ThreadPoolExecutor
from transforms.api import Output, transform
from transforms.mediasets import MediaSetInput
from transforms.mediasets.utils._constants import MEDIA_ITEM_RID, MEDIA_REFERENCE, PATH

# Upper bound on concurrent page-extraction requests within one batch
# (passed as max_workers to the thread pool below).
THREAD_NUMBER = 20


# @incremental(v2_semantics=True)  # Uncomment this line if incremental is needed.
@transform.using(
    output=Output(OUTPUT_DATASET_RID),
    media_input=MediaSetInput(INPUT_MEDIA_SET_RID),
)
def extract(media_input, output):
    """Extract layout-aware text from every page of every document in a media set.

    Lists the media items of ``media_input``, expands each item into one task
    per page (using the page count from the item's document metadata), runs the
    ``extractLayoutAwareTextV2`` operation on each page in ``ELECTRONIC`` mode
    with ``TEXT`` format, and writes one row per page to ``output``.

    Output columns: ``media_item_rid``, ``media_reference``, ``page_num``,
    ``extraction_result``.

    Raises:
        ValueError: If a media item has no document metadata or no page count.
    """
    # Single source of truth for the output schema: used both to declare the
    # result of map_batches and to build each batch's DataFrame, so an empty
    # batch still yields a frame with the declared columns.
    output_schema = {
        "media_item_rid": pl.String,
        "media_reference": pl.String,
        "page_num": pl.Int64,
        "extraction_result": pl.String,
    }

    media_refs = pl.from_pandas(
        media_input.list_media_items_by_path_with_media_reference().pandas(),
        schema_overrides={
            MEDIA_ITEM_RID: pl.String,
            MEDIA_REFERENCE: pl.String,
            PATH: pl.String,
        },
    )

    def process_batch(batch_df: pl.DataFrame) -> pl.DataFrame:
        """Run the per-page extraction for every media item in ``batch_df``."""

        def create_page_tasks(row):
            """Return one ``(row, page_num)`` task per page of the row's document."""
            media_item_rid = row[MEDIA_ITEM_RID]
            metadata = media_input.get_media_item_metadata(media_item_rid).document
            if metadata is None:
                raise ValueError(f"Media item {media_item_rid} is not a document")
            if metadata.pages is None:
                raise ValueError(f"Media item {media_item_rid} has no page count")
            return [(row, page_num) for page_num in range(metadata.pages)]

        def process_single_page(task):
            """Extract a single page and return its output row as a dict."""
            row, page_num = task
            media_item_rid = row[MEDIA_ITEM_RID]
            media_reference = row[MEDIA_REFERENCE]
            # NOTE(review): the second positional argument is the page number as
            # a string; presumably a per-request identifier/path -- confirm
            # against the transform_media_item API.
            extraction_result = media_input.transform_media_item(
                media_item_rid,
                str(page_num),
                {
                    "type": "documentToText",
                    "documentToText": {
                        "operation": {
                            "type": "extractLayoutAwareTextV2",
                            "extractLayoutAwareTextV2": {
                                # Half-open range covering exactly this page.
                                "pageRange": {
                                    "startPageInclusive": page_num,
                                    "endPageExclusive": page_num + 1,
                                },
                                "config": {
                                    "mode": "ELECTRONIC",
                                    "format": "TEXT",
                                },
                            },
                        },
                    },
                },
            )
            return {
                "media_item_rid": media_item_rid,
                "media_reference": media_reference,
                "page_num": page_num,
                # NOTE(review): if .json() returns a dict/list, str() stores its
                # Python repr rather than JSON text -- confirm this is what
                # downstream consumers expect.
                "extraction_result": str(extraction_result.json()),
            }

        all_tasks = []
        for row in batch_df.iter_rows(named=True):
            all_tasks.extend(create_page_tasks(row))

        # Requests are I/O-bound, so fan them out across threads; executor.map
        # preserves task order in the results.
        with ThreadPoolExecutor(max_workers=THREAD_NUMBER) as executor:
            results = list(executor.map(process_single_page, all_tasks))

        # Pass the schema explicitly: without it an empty `results` list would
        # produce a zero-column frame that does not match the declared schema.
        return pl.DataFrame(results, schema=output_schema)

    extracted_data = media_refs.lazy().map_batches(
        process_batch,
        schema=output_schema,
        streamable=True,
    )
    output.write_dataframe(extracted_data)
Copied!1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79from time import sleep from foundry_sdk import FoundryClient from foundry_sdk.v2.media_sets import models from functions.api import function #### Helper functions def _create_transform_job( media_set_rid: str, media_item_rid: str, transformation: models.DocumentToTextTransformation ) -> str: fc = FoundryClient() job_initiation_resp = fc.media_sets.MediaSet.transform( media_set_rid=media_set_rid, media_item_rid=media_item_rid, transformation=transformation, preview=True, ) job_id = job_initiation_resp.job_id return job_id def _is_transform_finished(media_set_rid: str, media_item_rid: str, job_id: str) -> bool: fc = FoundryClient() status = fc.media_sets.MediaSet.get_status(media_set_rid, media_item_rid, job_id, preview=True) return status.status in ("SUCCESSFUL", "FAILED") def _get_transform_result(media_set_rid: str, media_item_rid: str, job_id: str) -> str: fc = FoundryClient() result = fc.media_sets.MediaSet.get_result(media_set_rid, media_item_rid, job_id, preview=True) return result.decode("utf-8") def _run_transform_blocking( media_set_rid: str, media_item_rid: str, transformation: models.DocumentToTextTransformation ) -> str: job_id = _create_transform_job(media_set_rid, media_item_rid, transformation) while not _is_transform_finished(media_set_rid, media_item_rid, job_id): sleep(0.5) return _get_transform_result(media_set_rid, media_item_rid, job_id) # Use this function if your input is a media set # We suggest running the function across batches of five to ten pages to avoid timeout @function(beta=True) def transform_vlm(media_set_rid: str, media_item_rid: str, start_page_inclusive: int, end_page_exclusive: int) -> str: LANGUAGES: list[models.OcrLanguageOrScript] = [models.OcrLanguageWrapper(language="ENG")] return 
_run_transform_blocking( media_set_rid, media_item_rid, models.DocumentToTextTransformation( operation=models.ExtractDocumentLayoutAwareTextV2Operation( page_range=models.PageRange(start_page_inclusive=start_page_inclusive, end_page_exclusive=end_page_exclusive), config=models.ExtractDocumentLayoutAwareTextV2Config(languages=LANGUAGES), ) ), ) # Use this function if your input is an object # We suggest running the function across batches of five to ten pages to avoid timeout @function(beta=True, edits=[<YOUR_OBJECT_TYPE>]) def transform_vlm_object(myObject: <YOUR_OBJECT_TYPE>, start_page_inclusive: int, end_page_exclusive: int) -> str: reference_view = myObject.media_reference.get_media_reference().reference.media_set_view_item media_set_rid = reference_view.media_set_rid media_item_rid = reference_view.media_item_rid LANGUAGES: list[models.OcrLanguageOrScript] = [models.OcrLanguageWrapper(language="ENG")] return _run_transform_blocking( media_set_rid, media_item_rid, models.DocumentToTextTransformation( operation=models.ExtractDocumentLayoutAwareTextV2Operation( page_range=models.PageRange(start_page_inclusive=start_page_inclusive, end_page_exclusive=end_page_exclusive), config=models.ExtractDocumentLayoutAwareTextV2Config(languages=LANGUAGES), ) ), )