Modules

Top-level package for Document Tools.

`LayoutLMv2Encoder` ¶

Bases: BaseEncoder

LayoutLMv2Encoder is the encoder for datasets using LayoutLMv2.

Source code in document_tools/encoders/encoders.py

class LayoutLMv2Encoder(BaseEncoder):
    """Encoder turning image/label batches into LayoutLMv2 model inputs."""

    def __init__(self, **kwargs):
        """
        Set up the LayoutLMv2 processor and the schema of the encoded dataset.

        Parameters
        ----------
        kwargs : Dict[str, Any]
            Forwarded to the base encoder initializer. ``self.config`` is then passed on to
            ``LayoutLMv2Processor.from_pretrained``; see the processor documentation for the available parameters :
            https://huggingface.co/docs/transformers/model_doc/layoutlmv2#transformers.LayoutLMv2Processor
        """
        super().__init__(**kwargs)

        # `self.config` and `self.labels` are read here but not assigned in this class
        # (presumably provided by BaseEncoder.__init__ -- confirm).
        fallback_checkpoint = "microsoft/layoutlmv2-base-uncased"
        self.default_model = self.config.get("default_model", fallback_checkpoint)
        self.processor = LayoutLMv2Processor.from_pretrained(self.default_model, **self.config)

        label_names = self.labels
        schema = {
            "image": Array3D(dtype="int64", shape=(3, 224, 224)),
            "input_ids": Sequence(feature=Value(dtype="int64")),
            "attention_mask": Sequence(feature=Value(dtype="int64")),
            "token_type_ids": Sequence(feature=Value(dtype="int64")),
            "bbox": Array2D(dtype="int64", shape=(512, 4)),
            "labels": Sequence(feature=ClassLabel(num_classes=len(label_names), names=label_names)),
        }
        self.features = Features(schema)

    def __call__(self, batch: Dict[str, List]):
        """Convert the batch images to RGB, run the processor on them and attach the batch labels."""
        rgb_images = [picture.convert("RGB") for picture in batch["image"]]
        model_inputs = self.processor(rgb_images)
        # Labels are copied into a fresh list and stored next to the processor output.
        model_inputs["labels"] = list(batch["label"])
        return model_inputs

`__call__(batch)` ¶

Call the LayoutLMv2Encoder.

Source code in document_tools/encoders/encoders.py

def __call__(self, batch: Dict[str, List]):
    """Convert the batch images to RGB, run the LayoutLMv2 processor on them and attach the batch labels."""
    rgb_images = [picture.convert("RGB") for picture in batch["image"]]
    model_inputs = self.processor(rgb_images)
    # Labels are copied into a fresh list and stored next to the processor output.
    model_inputs["labels"] = list(batch["label"])
    return model_inputs

`__init__(**kwargs)` ¶

Initialize the LayoutLMv2Encoder.

Parameters¶

kwargs : Dict[str, Any]

Check the documentation of the LayoutLMv2Processor for the available parameters : https://huggingface.co/docs/transformers/model_doc/layoutlmv2#transformers.LayoutLMv2Processor

Source code in document_tools/encoders/encoders.py

def __init__(self, **kwargs):
    """
    Set up the LayoutLMv2 processor and the schema of the encoded dataset.

    Parameters
    ----------
    kwargs : Dict[str, Any]
        Forwarded to the base encoder initializer. ``self.config`` is then passed on to
        ``LayoutLMv2Processor.from_pretrained``; see the processor documentation for the available parameters :
        https://huggingface.co/docs/transformers/model_doc/layoutlmv2#transformers.LayoutLMv2Processor
    """
    super().__init__(**kwargs)

    # `self.config` and `self.labels` are read here but not assigned in this method
    # (presumably provided by the base initializer -- confirm).
    fallback_checkpoint = "microsoft/layoutlmv2-base-uncased"
    self.default_model = self.config.get("default_model", fallback_checkpoint)
    self.processor = LayoutLMv2Processor.from_pretrained(self.default_model, **self.config)

    label_names = self.labels
    schema = {
        "image": Array3D(dtype="int64", shape=(3, 224, 224)),
        "input_ids": Sequence(feature=Value(dtype="int64")),
        "attention_mask": Sequence(feature=Value(dtype="int64")),
        "token_type_ids": Sequence(feature=Value(dtype="int64")),
        "bbox": Array2D(dtype="int64", shape=(512, 4)),
        "labels": Sequence(feature=ClassLabel(num_classes=len(label_names), names=label_names)),
    }
    self.features = Features(schema)

`LayoutLMv3Encoder` ¶

Bases: BaseEncoder

LayoutLMv3Encoder is the encoder for datasets using LayoutLMv3.

Source code in document_tools/encoders/encoders.py

class LayoutLMv3Encoder(BaseEncoder):
    """Encoder turning image/label batches into LayoutLMv3 model inputs."""

    def __init__(self, **kwargs):
        """
        Set up the LayoutLMv3 processor and the schema of the encoded dataset.

        Parameters
        ----------
        kwargs : Dict[str, Any]
            Forwarded to the base encoder initializer. ``self.config`` is then passed on to
            ``LayoutLMv3Processor.from_pretrained``; see the processor documentation for the available parameters :
            https://huggingface.co/docs/transformers/model_doc/layoutlmv3#transformers.LayoutLMv3Processor
        """
        super().__init__(**kwargs)

        # `self.config` is read here but not assigned in this class
        # (presumably provided by BaseEncoder.__init__ -- confirm).
        fallback_checkpoint = "microsoft/layoutlmv3-base"
        self.default_model = self.config.get("default_model", fallback_checkpoint)
        self.processor = LayoutLMv3Processor.from_pretrained(self.default_model, **self.config)

        # Unlike the other encoders in this module, labels are declared as plain int64 values.
        schema = {
            "pixel_values": Array3D(dtype="float32", shape=(3, 224, 224)),
            "input_ids": Sequence(feature=Value(dtype="int64")),
            "attention_mask": Sequence(feature=Value(dtype="int64")),
            "bbox": Array2D(dtype="int64", shape=(512, 4)),
            "labels": Sequence(feature=Value(dtype="int64")),
        }
        self.features = Features(schema)

    def __call__(self, batch: Dict[str, List]):
        """Convert the batch images to RGB, run the processor on them and attach the batch labels."""
        rgb_pages = [page.convert("RGB") for page in batch["image"]]
        model_inputs = self.processor(rgb_pages)
        # Labels are copied into a fresh list and stored next to the processor output.
        model_inputs["labels"] = list(batch["label"])
        return model_inputs

`__call__(batch)` ¶

Call the LayoutLMv3Encoder.

Source code in document_tools/encoders/encoders.py

def __call__(self, batch: Dict[str, List]):
    """Convert the batch images to RGB, run the LayoutLMv3 processor on them and attach the batch labels."""
    rgb_pages = [page.convert("RGB") for page in batch["image"]]
    model_inputs = self.processor(rgb_pages)
    # Labels are copied into a fresh list and stored next to the processor output.
    model_inputs["labels"] = list(batch["label"])
    return model_inputs

`__init__(**kwargs)` ¶

Initialize the LayoutLMv3Encoder.

Parameters¶

kwargs : Dict[str, Any]

Check the documentation of the LayoutLMv3Processor for the available parameters : https://huggingface.co/docs/transformers/model_doc/layoutlmv3#transformers.LayoutLMv3Processor

Source code in document_tools/encoders/encoders.py

def __init__(self, **kwargs):
    """
    Set up the LayoutLMv3 processor and the schema of the encoded dataset.

    Parameters
    ----------
    kwargs : Dict[str, Any]
        Forwarded to the base encoder initializer. ``self.config`` is then passed on to
        ``LayoutLMv3Processor.from_pretrained``; see the processor documentation for the available parameters :
        https://huggingface.co/docs/transformers/model_doc/layoutlmv3#transformers.LayoutLMv3Processor
    """
    super().__init__(**kwargs)

    # `self.config` is read here but not assigned in this method
    # (presumably provided by the base initializer -- confirm).
    fallback_checkpoint = "microsoft/layoutlmv3-base"
    self.default_model = self.config.get("default_model", fallback_checkpoint)
    self.processor = LayoutLMv3Processor.from_pretrained(self.default_model, **self.config)

    # Labels are declared as plain int64 values in this schema.
    schema = {
        "pixel_values": Array3D(dtype="float32", shape=(3, 224, 224)),
        "input_ids": Sequence(feature=Value(dtype="int64")),
        "attention_mask": Sequence(feature=Value(dtype="int64")),
        "bbox": Array2D(dtype="int64", shape=(512, 4)),
        "labels": Sequence(feature=Value(dtype="int64")),
    }
    self.features = Features(schema)

`LayoutXLMEncoder` ¶

Bases: BaseEncoder

LayoutXLMEncoder is the encoder for datasets using LayoutXLM.

Source code in document_tools/encoders/encoders.py

class LayoutXLMEncoder(BaseEncoder):
    """Encoder turning image/label batches into LayoutXLM model inputs."""

    def __init__(self, **kwargs):
        """
        Set up the LayoutXLM processor and the schema of the encoded dataset.

        Parameters
        ----------
        kwargs : Dict[str, Any]
            Forwarded to the base encoder initializer. ``self.config`` is then passed on to
            ``LayoutXLMProcessor.from_pretrained``; see the processor documentation for the available parameters :
            https://huggingface.co/docs/transformers/model_doc/layoutxlm#transformers.LayoutXLMProcessor
        """
        super().__init__(**kwargs)

        # `self.config` and `self.labels` are read here but not assigned in this class
        # (presumably provided by BaseEncoder.__init__ -- confirm).
        fallback_checkpoint = "microsoft/layoutxlm-base"
        self.default_model = self.config.get("default_model", fallback_checkpoint)
        # Always overrides whatever the caller put under this key; mutates `self.config` in place.
        self.config["return_token_type_ids"] = True
        self.processor = LayoutXLMProcessor.from_pretrained(self.default_model, **self.config)

        label_names = self.labels
        # NOTE(review): `return_token_type_ids` is forced on above, yet "token_type_ids" is not
        # declared in this schema (it was left disabled by the author) -- confirm this is intended.
        schema = {
            "image": Array3D(dtype="int64", shape=(3, 224, 224)),
            "input_ids": Sequence(feature=Value(dtype="int64")),
            "attention_mask": Sequence(feature=Value(dtype="int64")),
            "bbox": Array2D(dtype="int64", shape=(512, 4)),
            "labels": Sequence(feature=ClassLabel(num_classes=len(label_names), names=label_names)),
        }
        self.features = Features(schema)

    def __call__(self, batch: Dict[str, List]):
        """Convert the batch images to RGB, run the processor on them and attach the batch labels."""
        rgb_scans = [scan.convert("RGB") for scan in batch["image"]]
        model_inputs = self.processor(rgb_scans)
        # Labels are copied into a fresh list and stored next to the processor output.
        model_inputs["labels"] = list(batch["label"])
        return model_inputs

`__call__(batch)` ¶

Call the LayoutXLMEncoder.

Source code in document_tools/encoders/encoders.py

def __call__(self, batch: Dict[str, List]):
    """Convert the batch images to RGB, run the LayoutXLM processor on them and attach the batch labels."""
    rgb_scans = [scan.convert("RGB") for scan in batch["image"]]
    model_inputs = self.processor(rgb_scans)
    # Labels are copied into a fresh list and stored next to the processor output.
    model_inputs["labels"] = list(batch["label"])
    return model_inputs

`__init__(**kwargs)` ¶

Initialize the LayoutXLMEncoder.

Parameters¶

kwargs : Dict[str, Any]

Check the documentation of the LayoutXLMProcessor for the available parameters : https://huggingface.co/docs/transformers/model_doc/layoutxlm#transformers.LayoutXLMProcessor

Source code in document_tools/encoders/encoders.py

def __init__(self, **kwargs):
    """
    Set up the LayoutXLM processor and the schema of the encoded dataset.

    Parameters
    ----------
    kwargs : Dict[str, Any]
        Forwarded to the base encoder initializer. ``self.config`` is then passed on to
        ``LayoutXLMProcessor.from_pretrained``; see the processor documentation for the available parameters :
        https://huggingface.co/docs/transformers/model_doc/layoutxlm#transformers.LayoutXLMProcessor
    """
    super().__init__(**kwargs)

    # `self.config` and `self.labels` are read here but not assigned in this method
    # (presumably provided by the base initializer -- confirm).
    fallback_checkpoint = "microsoft/layoutxlm-base"
    self.default_model = self.config.get("default_model", fallback_checkpoint)
    # Always overrides whatever the caller put under this key; mutates `self.config` in place.
    self.config["return_token_type_ids"] = True
    self.processor = LayoutXLMProcessor.from_pretrained(self.default_model, **self.config)

    label_names = self.labels
    # NOTE(review): `return_token_type_ids` is forced on above, yet "token_type_ids" is not
    # declared in this schema (it was left disabled by the author) -- confirm this is intended.
    schema = {
        "image": Array3D(dtype="int64", shape=(3, 224, 224)),
        "input_ids": Sequence(feature=Value(dtype="int64")),
        "attention_mask": Sequence(feature=Value(dtype="int64")),
        "bbox": Array2D(dtype="int64", shape=(512, 4)),
        "labels": Sequence(feature=ClassLabel(num_classes=len(label_names), names=label_names)),
    }
    self.features = Features(schema)

`tokenize_dataset(dataset, target_model=None, image_column='image', label_column='label', batched=True, batch_size=2, cache_file_names=None, keep_in_memory=False, num_proc=None, processor_config=None, save_to_disk=False, save_path=None)` ¶

Tokenize a dataset using a target model and return a new dataset with the encoded features and labels.

Parameters¶

dataset : Dataset or DatasetDict, required

Dataset to be tokenized.

target_model : str, optional (default=None)

Target model to use for tokenization.

image_column : str (default="image")

Name of the column containing the image.

label_column : str (default="label")

Name of the column containing the label.

batched : bool (default=True)

Whether to use batched encoding.

batch_size : int, optional (default=2)

Batch size for batched encoding.

cache_file_names : Dict[str, Optional[str]], optional (default=None)

Dictionary containing the cache file names for each target model.

keep_in_memory : bool (default=False)

Whether to keep the dataset in memory.

num_proc : int, optional (default=None)

Number of processes to use for batched encoding.

processor_config : Dict[str, Any], optional (default=None)

Configuration for the processor of the target model.

save_to_disk : bool (default=False)

Whether to save the dataset to disk or not.

save_path : str (default=None)

Path to save the dataset to disk if save_to_disk is True.

Returns¶

DatasetDict: Dataset with the encoded features and labels.

Raises¶

ValueError: If there is no target model for the dataset, or if saving to disk is requested but the save path is not provided. KeyError: If the target model is not supported. TypeError: If the dataset is not a Dataset or DatasetDict.

Source code in document_tools/tokenize.py

def tokenize_dataset(
    dataset: Union[Dataset, DatasetDict],
    target_model: Optional[str] = None,
    image_column: str = "image",
    label_column: str = "label",
    batched: bool = True,
    batch_size: Optional[int] = 2,
    cache_file_names: Optional[Dict[str, Optional[str]]] = None,
    keep_in_memory: bool = False,
    num_proc: Optional[int] = None,
    processor_config: Optional[Dict[str, Any]] = None,
    save_to_disk: bool = False,
    save_path: Optional[str] = None,
) -> DatasetDict:
    """
    Tokenize a dataset using a target model and return a new dataset with the encoded features and labels.

    The input is deep-copied before being encoded. A single `Dataset` is wrapped into a `DatasetDict` under the
    "train" key, so the result is always a `DatasetDict`.

    Parameters
    ----------
    dataset : Dataset or DatasetDict, required
        Dataset to be tokenized.
    target_model : str, optional (default=None)
        Target model to use for tokenization. Must be one of the keys of `TARGET_MODELS`.
    image_column : str (default="image")
        Name of the column containing the image.
    label_column : str (default="label")
        Name of the column containing the label.
    batched : bool (default=True)
        Whether to use batched encoding.
    batch_size : int, optional (default=2)
        Batch size for batched encoding.
    cache_file_names : Dict[str, Optional[str]], optional (default=None)
        Dictionary containing the cache file names for each target model.
    keep_in_memory : bool (default=False)
        Whether to keep the dataset in memory.
    num_proc : int, optional (default=None)
        Number of processes to use for batched encoding.
    processor_config : Dict[str, Any], optional (default=None)
        Configuration for the processor of the target model.
    save_to_disk : bool (default=False)
        Whether to save the dataset to disk or not.
    save_path : str (default=None)
        Path to save the dataset to disk if `save_to_disk` is True.

    Returns
    -------
    DatasetDict
        Dataset with the encoded features and labels.

    Raises
    ------
    ValueError
        If there is no target model for the dataset. Or if saving to disk is requested but the save path is not
        provided.
    KeyError
        If the target model is not supported.
    TypeError
        If the dataset is not a Dataset or DatasetDict.

    Notes
    -----
    A failure while writing the encoded dataset to disk is logged and does not prevent the encoded dataset from
    being returned.
    """
    if not target_model:
        raise ValueError("""You need to specify the target architecture you want to use to tokenize your dataset.""")
    # Membership test instead of a throw-away lookup inside try/except: same KeyError for callers,
    # without an unrelated chained "During handling of the above exception" traceback.
    if target_model not in TARGET_MODELS:
        raise KeyError(
            f"""
                You specified a `target_model` that is not supported. Available models: {list(TARGET_MODELS.keys())}
                If you think that new model should be available, please feel free to open a new issue on the project
                repository: https://github.com/deeptools-ai/document-tools/issues
            """
        )

    if save_to_disk and save_path is None:
        raise ValueError(
            """
            You need to specify a path to save the dataset, because you chose to save it to disk. You can disable saving
            to disk by setting `save_to_disk=False`.
        """
        )
    elif not save_to_disk and save_path is not None:
        logger.warning(
            """
            You have indicated a path to save the dataset, but have chosen not to save it to disk. You need to add
            `save_to_disk=True` to the call to `tokenize_dataset` to save the dataset to disk.
        """
        )
    elif not save_to_disk:
        # Only reached when saving is really disabled. Previously this message was also logged when
        # `save_to_disk=True` with a valid path, i.e. right before the dataset WAS saved.
        logger.info(
            """
        The dataset will not be saved to disk. If you want to save it to disk, add `save_to_disk=True` to the call to
        `tokenize_dataset`.
        """
        )

    # Reject unsupported inputs before paying for a deep copy of them.
    if not isinstance(dataset, (Dataset, DatasetDict)):
        raise TypeError(f"The dataset has to be either a `Dataset` or a `DatasetDict`. You provided: {type(dataset)}")

    # Work on a copy so the caller's dataset object is left untouched.
    dataset = copy.deepcopy(dataset)
    if isinstance(dataset, DatasetDict):
        tmp_dataset = dataset
        dataset_first_key = list(tmp_dataset.keys())[0]
    else:
        # A lone Dataset is exposed as the "train" split of a new DatasetDict.
        tmp_dataset = DatasetDict()
        dataset_first_key = "train"
        tmp_dataset[dataset_first_key] = dataset

    # Label names are taken from the first split only.
    first_split = tmp_dataset[dataset_first_key]
    label_feature = first_split.features[label_column].feature
    if isinstance(label_feature, ClassLabel):
        labels = label_feature.names
    else:
        labels = _get_label_list(first_split[label_column])

    encoder = TARGET_MODELS[target_model](config=processor_config, labels=labels)
    features = encoder.features

    # NOTE(review): the encoder classes shown alongside this function read batch["image"] and
    # batch["label"] directly, so custom `image_column` / `label_column` names may only affect
    # label discovery and column removal here -- confirm against the TARGET_MODELS encoders.
    encoded_dataset = tmp_dataset.map(
        encoder,
        features=features,
        remove_columns=[image_column, label_column],
        batched=batched,
        batch_size=batch_size,
        cache_file_names=cache_file_names,
        keep_in_memory=keep_in_memory,
        num_proc=num_proc,
    )

    if save_to_disk:
        # Best effort: a failed save is logged, the encoded dataset is still returned.
        try:
            encoded_dataset.save_to_disk(save_path)  # type: ignore
        except Exception as e:
            logger.error(e)

    return encoded_dataset

Modules

LayoutLMv2Encoder ¶

__call__(batch) ¶

__init__(**kwargs) ¶

Parameters¶

LayoutLMv3Encoder ¶

__call__(batch) ¶

__init__(**kwargs) ¶

Parameters¶

LayoutXLMEncoder ¶

__call__(batch) ¶

__init__(**kwargs) ¶

Parameters¶

tokenize_dataset(dataset, target_model=None, image_column='image', label_column='label', batched=True, batch_size=2, cache_file_names=None, keep_in_memory=False, num_proc=None, processor_config=None, save_to_disk=False, save_path=None) ¶

Parameters¶

Returns¶

Raises¶

`LayoutLMv2Encoder` ¶

`__call__(batch)` ¶

`__init__(**kwargs)` ¶

`LayoutLMv3Encoder` ¶

`__call__(batch)` ¶

`__init__(**kwargs)` ¶

`LayoutXLMEncoder` ¶

`__call__(batch)` ¶

`__init__(**kwargs)` ¶

`tokenize_dataset(dataset, target_model=None, image_column='image', label_column='label', batched=True, batch_size=2, cache_file_names=None, keep_in_memory=False, num_proc=None, processor_config=None, save_to_disk=False, save_path=None)` ¶