Skip to content

Modules

Top-level package for Document Tools.

LayoutLMv2Encoder

Bases: BaseEncoder

LayoutLMv2Encoder is the encoder for datasets using LayoutLMv2.

Source code in document_tools/encoders/encoders.py
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
class LayoutLMv2Encoder(BaseEncoder):
    """Encoder that turns batches of document images into LayoutLMv2 model inputs."""

    def __init__(self, **kwargs):
        """
        Set up the LayoutLMv2 processor and the schema of the encoded dataset.

        Parameters
        ----------
        kwargs : Dict[str, Any]
            Passed straight to the ``BaseEncoder`` initializer. The options accepted by the
            processor are listed in the LayoutLMv2Processor documentation:
            https://huggingface.co/docs/transformers/model_doc/layoutlmv2#transformers.LayoutLMv2Processor
        """
        super().__init__(**kwargs)
        # NOTE(review): ``self.config`` and ``self.labels`` are not assigned in this class;
        # presumably the base initializer provides them -- confirm against ``BaseEncoder``.
        fallback_checkpoint = "microsoft/layoutlmv2-base-uncased"
        self.default_model = self.config.get("default_model", fallback_checkpoint)
        # The whole config mapping is expanded into keyword arguments for the processor.
        self.processor = LayoutLMv2Processor.from_pretrained(self.default_model, **self.config)

        # Schema of the columns produced by ``__call__``.
        label_names = self.labels
        schema = {
            "image": Array3D(shape=(3, 224, 224), dtype="int64"),
            "input_ids": Sequence(feature=Value(dtype="int64")),
            "attention_mask": Sequence(feature=Value(dtype="int64")),
            "token_type_ids": Sequence(feature=Value(dtype="int64")),
            "bbox": Array2D(shape=(512, 4), dtype="int64"),
            "labels": Sequence(feature=ClassLabel(num_classes=len(label_names), names=label_names)),
        }
        self.features = Features(schema)

    def __call__(self, batch: Dict[str, List]):
        """
        Encode one batch for LayoutLMv2.

        Every entry of ``batch["image"]`` is converted to RGB and the resulting images are run
        through ``self.processor``; the entries of ``batch["label"]`` are then stored under the
        ``"labels"`` key of the processor output, which is returned.
        """
        rgb_images = [picture.convert("RGB") for picture in batch["image"]]
        encoding = self.processor(rgb_images)
        # Shallow copy of the label column, attached as-is.
        encoding["labels"] = list(batch["label"])
        return encoding

__call__(batch)

Call the LayoutLMv2Encoder.

Source code in document_tools/encoders/encoders.py
83
84
85
86
87
88
def __call__(self, batch: Dict[str, List]):
    """
    Encode one batch for LayoutLMv2.

    Every entry of ``batch["image"]`` is converted to RGB and the resulting images are run
    through ``self.processor``; the entries of ``batch["label"]`` are then stored under the
    ``"labels"`` key of the processor output, which is returned.
    """
    rgb_images = [picture.convert("RGB") for picture in batch["image"]]
    encoding = self.processor(rgb_images)
    # Shallow copy of the label column, attached as-is.
    encoding["labels"] = list(batch["label"])
    return encoding

__init__(**kwargs)

Initialize the LayoutLMv2Encoder.

Parameters
kwargs : Dict[str, Any]

Check the documentation of the LayoutLMv2Processor for the available parameters: https://huggingface.co/docs/transformers/model_doc/layoutlmv2#transformers.LayoutLMv2Processor

Source code in document_tools/encoders/encoders.py
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
def __init__(self, **kwargs):
    """
    Set up the LayoutLMv2 processor and the schema of the encoded dataset.

    Parameters
    ----------
    kwargs : Dict[str, Any]
        Passed straight to the parent initializer. The options accepted by the processor are
        listed in the LayoutLMv2Processor documentation:
        https://huggingface.co/docs/transformers/model_doc/layoutlmv2#transformers.LayoutLMv2Processor
    """
    super().__init__(**kwargs)
    # NOTE(review): ``self.config`` and ``self.labels`` are not assigned here; presumably the
    # parent initializer provides them -- confirm against the base class.
    fallback_checkpoint = "microsoft/layoutlmv2-base-uncased"
    self.default_model = self.config.get("default_model", fallback_checkpoint)
    # The whole config mapping is expanded into keyword arguments for the processor.
    self.processor = LayoutLMv2Processor.from_pretrained(self.default_model, **self.config)

    # Schema of the columns produced when the encoder is called on a batch.
    label_names = self.labels
    schema = {
        "image": Array3D(shape=(3, 224, 224), dtype="int64"),
        "input_ids": Sequence(feature=Value(dtype="int64")),
        "attention_mask": Sequence(feature=Value(dtype="int64")),
        "token_type_ids": Sequence(feature=Value(dtype="int64")),
        "bbox": Array2D(shape=(512, 4), dtype="int64"),
        "labels": Sequence(feature=ClassLabel(num_classes=len(label_names), names=label_names)),
    }
    self.features = Features(schema)

LayoutLMv3Encoder

Bases: BaseEncoder

LayoutLMv3Encoder is the encoder for datasets using LayoutLMv3.

Source code in document_tools/encoders/encoders.py
 91
 92
 93
 94
 95
 96
 97
 98
 99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
class LayoutLMv3Encoder(BaseEncoder):
    """Encoder that turns batches of document images into LayoutLMv3 model inputs."""

    def __init__(self, **kwargs):
        """
        Set up the LayoutLMv3 processor and the schema of the encoded dataset.

        Parameters
        ----------
        kwargs : Dict[str, Any]
            Passed straight to the ``BaseEncoder`` initializer. The options accepted by the
            processor are listed in the LayoutLMv3Processor documentation:
            https://huggingface.co/docs/transformers/model_doc/layoutlmv3#transformers.LayoutLMv3Processor
        """
        super().__init__(**kwargs)
        # NOTE(review): ``self.config`` is not assigned in this class; presumably the base
        # initializer provides it -- confirm against ``BaseEncoder``.
        fallback_checkpoint = "microsoft/layoutlmv3-base"
        self.default_model = self.config.get("default_model", fallback_checkpoint)
        # The whole config mapping is expanded into keyword arguments for the processor.
        self.processor = LayoutLMv3Processor.from_pretrained(self.default_model, **self.config)

        # Schema of the columns produced by ``__call__``; labels are declared as plain int64
        # sequences here rather than as class labels.
        schema = {
            "pixel_values": Array3D(shape=(3, 224, 224), dtype="float32"),
            "input_ids": Sequence(feature=Value(dtype="int64")),
            "attention_mask": Sequence(feature=Value(dtype="int64")),
            "bbox": Array2D(shape=(512, 4), dtype="int64"),
            "labels": Sequence(feature=Value(dtype="int64")),
        }
        self.features = Features(schema)

    def __call__(self, batch: Dict[str, List]):
        """
        Encode one batch for LayoutLMv3.

        Every entry of ``batch["image"]`` is converted to RGB and the resulting images are run
        through ``self.processor``; the entries of ``batch["label"]`` are then stored under the
        ``"labels"`` key of the processor output, which is returned.
        """
        rgb_images = [picture.convert("RGB") for picture in batch["image"]]
        encoding = self.processor(rgb_images)
        # Shallow copy of the label column, attached as-is.
        encoding["labels"] = list(batch["label"])
        return encoding

__call__(batch)

Call the LayoutLMv3Encoder.

Source code in document_tools/encoders/encoders.py
117
118
119
120
121
122
def __call__(self, batch: Dict[str, List]):
    """
    Encode one batch for LayoutLMv3.

    Every entry of ``batch["image"]`` is converted to RGB and the resulting images are run
    through ``self.processor``; the entries of ``batch["label"]`` are then stored under the
    ``"labels"`` key of the processor output, which is returned.
    """
    rgb_images = [picture.convert("RGB") for picture in batch["image"]]
    encoding = self.processor(rgb_images)
    # Shallow copy of the label column, attached as-is.
    encoding["labels"] = list(batch["label"])
    return encoding

__init__(**kwargs)

Initialize the LayoutLMv3Encoder.

Parameters
kwargs : Dict[str, Any]

Check the documentation of the LayoutLMv3Processor for the available parameters: https://huggingface.co/docs/transformers/model_doc/layoutlmv3#transformers.LayoutLMv3Processor

Source code in document_tools/encoders/encoders.py
 94
 95
 96
 97
 98
 99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
def __init__(self, **kwargs):
    """
    Set up the LayoutLMv3 processor and the schema of the encoded dataset.

    Parameters
    ----------
    kwargs : Dict[str, Any]
        Passed straight to the parent initializer. The options accepted by the processor are
        listed in the LayoutLMv3Processor documentation:
        https://huggingface.co/docs/transformers/model_doc/layoutlmv3#transformers.LayoutLMv3Processor
    """
    super().__init__(**kwargs)
    # NOTE(review): ``self.config`` is not assigned here; presumably the parent initializer
    # provides it -- confirm against the base class.
    fallback_checkpoint = "microsoft/layoutlmv3-base"
    self.default_model = self.config.get("default_model", fallback_checkpoint)
    # The whole config mapping is expanded into keyword arguments for the processor.
    self.processor = LayoutLMv3Processor.from_pretrained(self.default_model, **self.config)

    # Schema of the columns produced when the encoder is called on a batch; labels are declared
    # as plain int64 sequences here rather than as class labels.
    schema = {
        "pixel_values": Array3D(shape=(3, 224, 224), dtype="float32"),
        "input_ids": Sequence(feature=Value(dtype="int64")),
        "attention_mask": Sequence(feature=Value(dtype="int64")),
        "bbox": Array2D(shape=(512, 4), dtype="int64"),
        "labels": Sequence(feature=Value(dtype="int64")),
    }
    self.features = Features(schema)

LayoutXLMEncoder

Bases: BaseEncoder

LayoutXLMEncoder is the encoder for datasets using LayoutXLM.

Source code in document_tools/encoders/encoders.py
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
class LayoutXLMEncoder(BaseEncoder):
    """Encoder that turns batches of document images into LayoutXLM model inputs."""

    def __init__(self, **kwargs):
        """
        Set up the LayoutXLM processor and the schema of the encoded dataset.

        Parameters
        ----------
        kwargs : Dict[str, Any]
            Passed straight to the ``BaseEncoder`` initializer. The options accepted by the
            processor are listed in the LayoutXLMProcessor documentation:
            https://huggingface.co/docs/transformers/model_doc/layoutxlm#transformers.LayoutXLMProcessor
        """
        super().__init__(**kwargs)
        # NOTE(review): ``self.config`` and ``self.labels`` are not assigned in this class;
        # presumably the base initializer provides them -- confirm against ``BaseEncoder``.
        fallback_checkpoint = "microsoft/layoutxlm-base"
        self.default_model = self.config.get("default_model", fallback_checkpoint)
        # Overrides any caller-supplied value for this key before the config is handed over.
        self.config["return_token_type_ids"] = True
        # The whole config mapping is expanded into keyword arguments for the processor.
        self.processor = LayoutXLMProcessor.from_pretrained(self.default_model, **self.config)

        # Schema of the columns produced by ``__call__``.
        # NOTE(review): no "token_type_ids" column is declared here even though the config key
        # "return_token_type_ids" is set to True above -- confirm this is intended.
        label_names = self.labels
        schema = {
            "image": Array3D(shape=(3, 224, 224), dtype="int64"),
            "input_ids": Sequence(feature=Value(dtype="int64")),
            "attention_mask": Sequence(feature=Value(dtype="int64")),
            "bbox": Array2D(shape=(512, 4), dtype="int64"),
            "labels": Sequence(feature=ClassLabel(num_classes=len(label_names), names=label_names)),
        }
        self.features = Features(schema)

    def __call__(self, batch: Dict[str, List]):
        """
        Encode one batch for LayoutXLM.

        Every entry of ``batch["image"]`` is converted to RGB and the resulting images are run
        through ``self.processor``; the entries of ``batch["label"]`` are then stored under the
        ``"labels"`` key of the processor output, which is returned.
        """
        rgb_images = [picture.convert("RGB") for picture in batch["image"]]
        encoding = self.processor(rgb_images)
        # Shallow copy of the label column, attached as-is.
        encoding["labels"] = list(batch["label"])
        return encoding

__call__(batch)

Call the LayoutXLMEncoder.

Source code in document_tools/encoders/encoders.py
153
154
155
156
157
158
def __call__(self, batch: Dict[str, List]):
    """
    Encode one batch for LayoutXLM.

    Every entry of ``batch["image"]`` is converted to RGB and the resulting images are run
    through ``self.processor``; the entries of ``batch["label"]`` are then stored under the
    ``"labels"`` key of the processor output, which is returned.
    """
    rgb_images = [picture.convert("RGB") for picture in batch["image"]]
    encoding = self.processor(rgb_images)
    # Shallow copy of the label column, attached as-is.
    encoding["labels"] = list(batch["label"])
    return encoding

__init__(**kwargs)

Initialize the LayoutXLMEncoder.

Parameters
kwargs : Dict[str, Any]

Check the documentation of the LayoutXLMProcessor for the available parameters: https://huggingface.co/docs/transformers/model_doc/layoutxlm#transformers.LayoutXLMProcessor

Source code in document_tools/encoders/encoders.py
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
def __init__(self, **kwargs):
    """
    Set up the LayoutXLM processor and the schema of the encoded dataset.

    Parameters
    ----------
    kwargs : Dict[str, Any]
        Passed straight to the parent initializer. The options accepted by the processor are
        listed in the LayoutXLMProcessor documentation:
        https://huggingface.co/docs/transformers/model_doc/layoutxlm#transformers.LayoutXLMProcessor
    """
    super().__init__(**kwargs)
    # NOTE(review): ``self.config`` and ``self.labels`` are not assigned here; presumably the
    # parent initializer provides them -- confirm against the base class.
    fallback_checkpoint = "microsoft/layoutxlm-base"
    self.default_model = self.config.get("default_model", fallback_checkpoint)
    # Overrides any caller-supplied value for this key before the config is handed over.
    self.config["return_token_type_ids"] = True
    # The whole config mapping is expanded into keyword arguments for the processor.
    self.processor = LayoutXLMProcessor.from_pretrained(self.default_model, **self.config)

    # Schema of the columns produced when the encoder is called on a batch.
    # NOTE(review): no "token_type_ids" column is declared here even though the config key
    # "return_token_type_ids" is set to True above -- confirm this is intended.
    label_names = self.labels
    schema = {
        "image": Array3D(shape=(3, 224, 224), dtype="int64"),
        "input_ids": Sequence(feature=Value(dtype="int64")),
        "attention_mask": Sequence(feature=Value(dtype="int64")),
        "bbox": Array2D(shape=(512, 4), dtype="int64"),
        "labels": Sequence(feature=ClassLabel(num_classes=len(label_names), names=label_names)),
    }
    self.features = Features(schema)

tokenize_dataset(dataset, target_model=None, image_column='image', label_column='label', batched=True, batch_size=2, cache_file_names=None, keep_in_memory=False, num_proc=None, processor_config=None, save_to_disk=False, save_path=None)

Tokenize a dataset using a target model and return a new dataset with the encoded features and labels.

Parameters
dataset : Dataset or DatasetDict, required

Dataset to be tokenized.

target_model : str, optional (default=None)

Target model to use for tokenization.

image_column : str (default="image")

Name of the column containing the image.

label_column : str (default="label")

Name of the column containing the label.

batched : bool (default=True)

Whether to use batched encoding.

batch_size : int, optional (default=2)

Batch size for batched encoding.

cache_file_names : Dict[str, Optional[str]], optional (default=None)

Dictionary containing the cache file names for each target model.

keep_in_memory : bool (default=False)

Whether to keep the dataset in memory.

num_proc : int, optional (default=None)

Number of processes to use for batched encoding.

processor_config : Dict[str, Any], optional (default=None)

Configuration for the processor of the target model.

save_to_disk : bool (default=False)

Whether to save the dataset to disk or not.

save_path : str (default=None)

Path to save the dataset to disk if save_to_disk is True.

Returns

DatasetDict: Dataset with the encoded features and labels.

Raises

ValueError: If there is no target model for the dataset, or if saving to disk is requested but the save path is not provided. KeyError: If the target model is not supported. TypeError: If the dataset is not a Dataset or DatasetDict.

Source code in document_tools/tokenize.py
 29
 30
 31
 32
 33
 34
 35
 36
 37
 38
 39
 40
 41
 42
 43
 44
 45
 46
 47
 48
 49
 50
 51
 52
 53
 54
 55
 56
 57
 58
 59
 60
 61
 62
 63
 64
 65
 66
 67
 68
 69
 70
 71
 72
 73
 74
 75
 76
 77
 78
 79
 80
 81
 82
 83
 84
 85
 86
 87
 88
 89
 90
 91
 92
 93
 94
 95
 96
 97
 98
 99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
def tokenize_dataset(
    dataset: Union[Dataset, DatasetDict],
    target_model: Optional[str] = None,
    image_column: str = "image",
    label_column: str = "label",
    batched: bool = True,
    batch_size: Optional[int] = 2,
    cache_file_names: Optional[Dict[str, Optional[str]]] = None,
    keep_in_memory: bool = False,
    num_proc: Optional[int] = None,
    processor_config: Optional[Dict[str, Any]] = None,
    save_to_disk: bool = False,
    save_path: Optional[str] = None,
) -> DatasetDict:
    """
    Tokenize a dataset using a target model and return a new dataset with the encoded features and labels.

    The input is deep-copied before any processing. A single ``Dataset`` is wrapped in a
    ``DatasetDict`` under the ``"train"`` key, so the return value is always a ``DatasetDict``.

    Parameters
    ----------
    dataset : Dataset or DatasetDict, required
        Dataset to be tokenized.
    target_model : str, optional (default=None)
        Target model to use for tokenization. Must be a key of ``TARGET_MODELS``.
    image_column : str (default="image")
        Name of the column containing the image. The column is removed from the encoded dataset.
    label_column : str (default="label")
        Name of the column containing the label. The column is removed from the encoded dataset.
    batched : bool (default=True)
        Whether to use batched encoding.
    batch_size : int, optional (default=2)
        Batch size for batched encoding.
    cache_file_names : Dict[str, Optional[str]], optional (default=None)
        Dictionary containing the cache file names; forwarded unchanged to ``DatasetDict.map``.
    keep_in_memory : bool (default=False)
        Whether to keep the dataset in memory.
    num_proc : int, optional (default=None)
        Number of processes to use for batched encoding.
    processor_config : Dict[str, Any], optional (default=None)
        Configuration for the processor of the target model; passed to the encoder as ``config``.
    save_to_disk : bool (default=False)
        Whether to save the dataset to disk or not.
    save_path : str, optional (default=None)
        Path to save the dataset to disk if `save_to_disk` is True.

    Returns
    -------
    DatasetDict
        Dataset with the encoded features and labels.

    Raises
    ------
    ValueError
        If there is no target model for the dataset. Or if saving to disk is requested but the save path is not
        provided.
    KeyError
        If the target model is not supported.
    TypeError
        If the dataset is not a Dataset or DatasetDict.
    """
    if not target_model:
        raise ValueError("""You need to specify the target architecture you want to use to tokenize your dataset.""")
    if target_model not in TARGET_MODELS:
        raise KeyError(
            f"""
                You specified a `target_model` that is not supported. Available models: {list(TARGET_MODELS.keys())}
                If you think that new model should be available, please feel free to open a new issue on the project
                repository: https://github.com/deeptools-ai/document-tools/issues
            """
        )

    if save_to_disk and save_path is None:
        raise ValueError(
            """
            You need to specify a path to save the dataset, because you chose to save it to disk. You can disable saving
            to disk by setting `save_to_disk=False`.
        """
        )
    elif not save_to_disk and save_path is not None:
        logger.warning(
            """
            You have indicated a path to save the dataset, but have chosen not to save it to disk. You need to add
            `save_to_disk=True` to the call to `tokenize_dataset` to save the dataset to disk.
        """
        )
    elif not save_to_disk:
        # Only announce "not saved" when saving is really disabled; with `save_to_disk=True` and a
        # valid `save_path` the dataset IS written below, so this message must not be emitted.
        logger.info(
            """
        The dataset will not be saved to disk. If you want to save it to disk, add `save_to_disk=True` to the call to
        `tokenize_dataset`.
        """
        )

    # Work on a copy so the caller's dataset object is never touched.
    dataset = copy.deepcopy(dataset)
    if isinstance(dataset, DatasetDict):
        tmp_dataset = dataset
        dataset_first_key = list(tmp_dataset.keys())[0]
    elif isinstance(dataset, Dataset):
        # Normalize a single Dataset to a one-split DatasetDict so the rest of the code has one shape to handle.
        tmp_dataset = DatasetDict()
        dataset_first_key = "train"
        tmp_dataset[dataset_first_key] = dataset
    else:
        raise TypeError(f"The dataset has to be either a `Dataset` or a `DatasetDict`. You provided: {type(dataset)}")

    # The label names are taken from the first split only.
    # NOTE(review): `.feature` is read unconditionally, so the label column is assumed to be a
    # sequence-like feature that exposes it -- confirm behaviour for scalar label columns.
    label_feature = tmp_dataset[dataset_first_key].features[label_column].feature
    if isinstance(label_feature, ClassLabel):
        labels = label_feature.names
    else:
        labels = _get_label_list(tmp_dataset[dataset_first_key][label_column])

    encoder = TARGET_MODELS[target_model](config=processor_config, labels=labels)
    features = encoder.features

    encoded_dataset = tmp_dataset.map(
        encoder,
        features=features,
        remove_columns=[image_column, label_column],
        batched=batched,
        batch_size=batch_size,
        cache_file_names=cache_file_names,
        keep_in_memory=keep_in_memory,
        num_proc=num_proc,
    )

    if save_to_disk:
        # Best effort: a failed save is logged but does not prevent returning the encoded dataset.
        try:
            encoded_dataset.save_to_disk(save_path)  # type: ignore
        except Exception as e:
            logger.error(e)

    return encoded_dataset