-
Notifications
You must be signed in to change notification settings - Fork 44
fix(cdk): upgrade unstructured from 0.10.27 to 0.18.32 #908
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
base: main
Are you sure you want to change the base?
Changes from all commits
31b3ab1
f3ac153
4b907be
eb167ef
74612b8
c7fed0f
400b278
6ca00a7
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
| Original file line number | Diff line number | Diff line change | ||||||||
|---|---|---|---|---|---|---|---|---|---|---|
|
|
@@ -6,19 +6,13 @@ | |||||||||
| import traceback | ||||||||||
| from datetime import datetime | ||||||||||
| from io import BytesIO, IOBase | ||||||||||
| from typing import Any, Dict, Iterable, List, Mapping, Optional, Tuple, Union | ||||||||||
| from typing import IO, Any, Dict, Iterable, List, Mapping, Optional, Tuple, Union, cast | ||||||||||
|
|
||||||||||
| import backoff | ||||||||||
| import dpath | ||||||||||
| import nltk | ||||||||||
| import requests | ||||||||||
| from unstructured.file_utils.filetype import ( | ||||||||||
| EXT_TO_FILETYPE, | ||||||||||
| FILETYPE_TO_MIMETYPE, | ||||||||||
| STR_TO_FILETYPE, | ||||||||||
| FileType, | ||||||||||
| detect_filetype, | ||||||||||
| ) | ||||||||||
| from unstructured.file_utils.filetype import FileType, detect_filetype | ||||||||||
|
|
||||||||||
| from airbyte_cdk.models import FailureType | ||||||||||
| from airbyte_cdk.sources.file_based.config.file_based_stream_config import FileBasedStreamConfig | ||||||||||
|
|
@@ -85,14 +79,21 @@ def _import_unstructured() -> None: | |||||||||
| global unstructured_partition_docx | ||||||||||
| global unstructured_partition_pptx | ||||||||||
| from unstructured.partition.docx import partition_docx | ||||||||||
| from unstructured.partition.pdf import partition_pdf | ||||||||||
| from unstructured.partition.pptx import partition_pptx | ||||||||||
|
|
||||||||||
| # separate global variables to properly propagate typing | ||||||||||
| unstructured_partition_pdf = partition_pdf | ||||||||||
| unstructured_partition_docx = partition_docx | ||||||||||
| unstructured_partition_pptx = partition_pptx | ||||||||||
|
|
||||||||||
| try: | ||||||||||
| from unstructured.partition.pdf import partition_pdf | ||||||||||
|
|
||||||||||
| unstructured_partition_pdf = partition_pdf | ||||||||||
| except ImportError: | ||||||||||
| logger = logging.getLogger(__name__) | ||||||||||
| logger.info( | ||||||||||
| "Could not import unstructured.partition.pdf (requires unstructured_inference). PDF parsing will be unavailable." | ||||||||||
| ) | ||||||||||
|
|
||||||||||
|
|
||||||||||
| def user_error(e: Exception) -> bool: | ||||||||||
| """ | ||||||||||
|
|
@@ -207,13 +208,6 @@ def _read_file( | |||||||||
| logger: logging.Logger, | ||||||||||
| ) -> str: | ||||||||||
| _import_unstructured() | ||||||||||
| if ( | ||||||||||
| (not unstructured_partition_pdf) | ||||||||||
| or (not unstructured_partition_docx) | ||||||||||
| or (not unstructured_partition_pptx) | ||||||||||
| ): | ||||||||||
| # check whether unstructured library is actually available for better error message and to ensure proper typing (can't be None after this point) | ||||||||||
| raise Exception("unstructured library is not available") | ||||||||||
|
|
||||||||||
| filetype: FileType | None = self._get_filetype(file_handle, remote_file) | ||||||||||
|
|
||||||||||
|
|
@@ -335,7 +329,7 @@ def _read_file_remotely( | |||||||||
|
|
||||||||||
| data = self._params_to_dict(format.parameters, strategy) | ||||||||||
|
|
||||||||||
| file_data = {"files": ("filename", file_handle, FILETYPE_TO_MIMETYPE[filetype])} | ||||||||||
| file_data = {"files": ("filename", file_handle, filetype.mime_type)} | ||||||||||
|
|
||||||||||
| response = requests.post( | ||||||||||
| f"{format.api_url}/general/v0/general", headers=headers, data=data, files=file_data | ||||||||||
|
|
@@ -356,32 +350,38 @@ def _read_file_locally( | |||||||||
| self, file_handle: IOBase, filetype: FileType, strategy: str, remote_file: RemoteFile | ||||||||||
| ) -> str: | ||||||||||
| _import_unstructured() | ||||||||||
| if ( | ||||||||||
| (not unstructured_partition_pdf) | ||||||||||
| or (not unstructured_partition_docx) | ||||||||||
| or (not unstructured_partition_pptx) | ||||||||||
| ): | ||||||||||
| # check whether unstructured library is actually available for better error message and to ensure proper typing (can't be None after this point) | ||||||||||
| raise Exception("unstructured library is not available") | ||||||||||
|
|
||||||||||
| file: Any = file_handle | ||||||||||
|
|
||||||||||
| # before the parsing logic is entered, the file is read completely to make sure it is in local memory | ||||||||||
| file_handle.seek(0) | ||||||||||
| file_handle.read() | ||||||||||
| file_handle.seek(0) | ||||||||||
|
|
||||||||||
| try: | ||||||||||
| if filetype == FileType.PDF: | ||||||||||
| # for PDF, read the file into a BytesIO object because some code paths in pdf parsing are doing an instance check on the file object and don't work with file-like objects | ||||||||||
| if not unstructured_partition_pdf: | ||||||||||
| raise self._create_parse_error( | ||||||||||
| remote_file, | ||||||||||
| "PDF parsing requires the 'unstructured_inference' package. Install it with: pip install unstructured-inference", | ||||||||||
| ) | ||||||||||
| file_handle.seek(0) | ||||||||||
| with BytesIO(file_handle.read()) as file: | ||||||||||
| file_handle.seek(0) | ||||||||||
| elements = unstructured_partition_pdf(file=file, strategy=strategy) | ||||||||||
| elif filetype == FileType.DOCX: | ||||||||||
| if not unstructured_partition_docx: | ||||||||||
| raise self._create_parse_error( | ||||||||||
| remote_file, "DOCX partition function is not available" | ||||||||||
| ) | ||||||||||
| elements = unstructured_partition_docx(file=file) | ||||||||||
| elif filetype == FileType.PPTX: | ||||||||||
| if not unstructured_partition_pptx: | ||||||||||
| raise self._create_parse_error( | ||||||||||
| remote_file, "PPTX partition function is not available" | ||||||||||
| ) | ||||||||||
| elements = unstructured_partition_pptx(file=file) | ||||||||||
| except RecordParseError: | ||||||||||
| raise | ||||||||||
| except Exception as e: | ||||||||||
| raise self._create_parse_error(remote_file, str(e)) | ||||||||||
|
|
||||||||||
|
|
@@ -405,8 +405,10 @@ def _get_filetype(self, file: IOBase, remote_file: RemoteFile) -> Optional[FileT | |||||||||
| 2. Use the file name if available | ||||||||||
| 3. Use the file content | ||||||||||
| """ | ||||||||||
| if remote_file.mime_type and remote_file.mime_type in STR_TO_FILETYPE: | ||||||||||
| return STR_TO_FILETYPE[remote_file.mime_type] | ||||||||||
| if remote_file.mime_type: | ||||||||||
| ft = FileType.from_mime_type(remote_file.mime_type) | ||||||||||
| if ft is not None and ft != FileType.UNK: | ||||||||||
| return ft | ||||||||||
|
|
||||||||||
|
rwask marked this conversation as resolved.
|
||||||||||
| # set name to none, otherwise unstructured will try to get the modified date from the local file system | ||||||||||
| if hasattr(file, "name"): | ||||||||||
|
|
@@ -418,7 +420,7 @@ def _get_filetype(self, file: IOBase, remote_file: RemoteFile) -> Optional[FileT | |||||||||
| file_type: FileType | None = None | ||||||||||
| try: | ||||||||||
| file_type = detect_filetype( | ||||||||||
| filename=remote_file.uri, | ||||||||||
| file_path=remote_file.uri, | ||||||||||
| ) | ||||||||||
| except Exception: | ||||||||||
| # Path doesn't exist locally. Try something else... | ||||||||||
|
|
@@ -427,16 +429,17 @@ def _get_filetype(self, file: IOBase, remote_file: RemoteFile) -> Optional[FileT | |||||||||
| if file_type and file_type != FileType.UNK: | ||||||||||
| return file_type | ||||||||||
|
|
||||||||||
| type_based_on_content = detect_filetype(file=file) | ||||||||||
| file.seek(0) # detect_filetype is reading to read the file content, so we need to reset | ||||||||||
| extension = "." + remote_file.uri.split(".")[-1].lower() | ||||||||||
| ext_type = FileType.from_extension(extension) | ||||||||||
| if ext_type is not None: | ||||||||||
| return ext_type | ||||||||||
|
rwask marked this conversation as resolved.
Comment on lines
+434
to
+435
Contributor
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. 🟡 Missing In
Suggested change
Was this helpful? React with 👍 or 👎 to provide feedback. |
||||||||||
|
|
||||||||||
| type_based_on_content = detect_filetype(file=cast(IO[bytes], file)) | ||||||||||
| file.seek(0) | ||||||||||
|
|
||||||||||
| if type_based_on_content and type_based_on_content != FileType.UNK: | ||||||||||
| return type_based_on_content | ||||||||||
|
|
||||||||||
| extension = "." + remote_file.uri.split(".")[-1].lower() | ||||||||||
| if extension in EXT_TO_FILETYPE: | ||||||||||
| return EXT_TO_FILETYPE[extension] | ||||||||||
|
|
||||||||||
| return None | ||||||||||
|
|
||||||||||
| def _supported_file_types(self) -> List[Any]: | ||||||||||
|
|
||||||||||
Uh oh!
There was an error while loading. Please reload this page.