Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 0 additions & 1 deletion .pre-commit-config.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -23,7 +23,6 @@ repos:
- id: fix-byte-order-marker
- id: name-tests-test
args: [ '--pytest-test-first' ]
exclude: ^tests/_duplicates.py$
- id: no-commit-to-branch
args: [ '--branch', 'main' ]
- id: trailing-whitespace
Expand Down
18 changes: 0 additions & 18 deletions docs/tool-overview-databundle.rst
Original file line number Diff line number Diff line change
Expand Up @@ -84,22 +84,4 @@ Now the meteorological data can be mapped to the Common Data Model (CDM_) using

For more information on how the mapping works, please see :ref:`tool-overview-mapper` and/or :ref:`how-to-register-a-new-data-model-mapping`.

:ref:`dupdetect`
^^^^^^^^^^^^^^^^

After mapping to the CDM format, it is useful to check whether the CDM tables contain any duplicates. The duplicate checker included in the ``cdm_reader_mapper`` toolbox is based on the Python Record Linkage Toolkit RecordLinkage_.

The first step is to call the method :func:`.DataBundle.duplicate_check`. This method scans the CDM tables for duplicates.

.. code-block:: python

    db_dup = db.duplicate_check()

Afterwards, there are two options for dealing with the detected duplicates:

1. :func:`.DataBundle.flag_duplicates`
2. :func:`.DataBundle.remove_duplicates`

The first function flags the detected duplicates. For more information about the flags see `CDM code tables for duplicate_status`_ and `CDM code tables for report_quality`_. The second function removes the detected duplicates.

.. include:: hyperlinks.rst
6 changes: 0 additions & 6 deletions src/cdm_reader_mapper/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -19,10 +19,6 @@
from .core.reader import read
from .core.writer import write
from .data import test_data
from .duplicates.duplicates import (
DupDetect,
duplicate_check,
)
from .mdf_reader.reader import read_data, read_mdf
from .mdf_reader.writer import write_data
from .metmetpy import (
Expand All @@ -35,11 +31,9 @@

__all__ = [
"DataBundle",
"DupDetect",
"cdm_tables",
"correct_datetime",
"correct_pt",
"duplicate_check",
"map_model",
"read",
"read_data",
Expand Down
207 changes: 0 additions & 207 deletions src/cdm_reader_mapper/core/databundle.py
Original file line number Diff line number Diff line change
Expand Up @@ -17,7 +17,6 @@
split_by_index,
)
from cdm_reader_mapper.common.iterators import ParquetStreamReader, is_valid_iterator
from cdm_reader_mapper.duplicates.duplicates import DupDetect, duplicate_check
from cdm_reader_mapper.metmetpy import (
correct_datetime,
correct_pt,
Expand Down Expand Up @@ -154,7 +153,6 @@ def __init__(
self._mask: pd.DataFrame | ParquetStreamReader = mask
self._imodel = imodel
self._mode = mode
self.DupDetect: DupDetect | None = None

def __len__(self) -> int:
"""
Expand Down Expand Up @@ -1414,208 +1412,3 @@ def write(
mode=mode,
**kwargs,
)

def duplicate_check(self, inplace: bool = False, **kwargs: Any) -> DataBundle | None:
    r"""
    Run the duplicate check on :py:attr:`data`.

    Parameters
    ----------
    inplace : bool, default: False
        If True, attach the check result to this :py:class:`~DataBundle`;
        otherwise work on a copy of :py:class:`~DataBundle` and return it.
    \**kwargs : Any
        Additional keyword-arguments forwarded to :py:func:`duplicate_check`.

    Returns
    -------
    :py:class:`~DataBundle` or None
        DataBundle carrying a new :py:class:`~DupDetect` instance that the
        other duplicate methods rely on, or None if "inplace=True".

    See Also
    --------
    DataBundle.get_duplicates : Get duplicate matches in `data`.
    DataBundle.flag_duplicates : Flag detected duplicates in `data`.
    DataBundle.remove_duplicates : Remove detected duplicates in `data`.

    Notes
    -----
    Following columns have to be provided:

    * `longitude`
    * `latitude`
    * `primary_station_id`
    * `report_timestamp`
    * `station_course`
    * `station_speed`

    The resulting :py:class:`~DupDetect` instance is stored on the
    :py:class:`~DataBundle` as ``DupDetect``; the flag/get/remove methods
    require it.

    For more information see :py:func:`duplicate_check`

    Examples
    --------
    >>> db.duplicate_check()
    """
    bundle = self._get_db(inplace)
    if bundle is None:
        return None

    # In "tables" mode only the "header" entry is checked, when it exists;
    # in every other case the complete data object is checked.
    has_header_table = bundle._mode == "tables" and "header" in bundle._data
    candidates = bundle._data["header"] if has_header_table else bundle._data

    bundle.DupDetect = duplicate_check(candidates, **kwargs)
    return self._return_db(bundle, inplace)

def flag_duplicates(self, inplace: bool = False, **kwargs: Any) -> DataBundle | None:
    r"""
    Flag detected duplicates in :py:attr:`data`.

    Parameters
    ----------
    inplace : bool, default: False
        If True overwrite :py:attr:`data` in :py:class:`~DataBundle`
        else return a copy of :py:class:`~DataBundle` with :py:attr:`data` containing flagged duplicates.
    \**kwargs : Any
        Additional keyword-arguments for flagging duplicates.

    Returns
    -------
    :py:class:`~DataBundle` or None
        DataBundle containing duplicate flags in :py:attr:`data` or None if "inplace=True".

    Raises
    ------
    RuntimeError
        Before flagging duplicates, a duplicate check has to be done, :py:func:`DataBundle.duplicate_check`.

    See Also
    --------
    DataBundle.remove_duplicates : Remove detected duplicates in `data`.
    DataBundle.get_duplicates : Get duplicate matches in `data`.
    DataBundle.duplicate_check : Duplicate check in `data`.

    Notes
    -----
    For more information see :py:func:`DupDetect.flag_duplicates`

    Examples
    --------
    Flag duplicates without overwriting :py:attr:`data`.

    >>> flagged_tables = db.flag_duplicates()

    Flag duplicates with overwriting :py:attr:`data`.

    >>> db.flag_duplicates(inplace=True)
    >>> flagged_tables = db.data
    """
    db_ = self._get_db(inplace)
    if db_ is None:
        return None

    if db_.DupDetect is None:
        raise RuntimeError("Before flagging duplicates, a duplicate check has to be done: 'db.duplicate_check()'")

    db_.DupDetect.flag_duplicates(**kwargs)

    # In "tables" mode with a "header" entry only that entry is replaced by
    # the flagged result; otherwise the result replaces the whole data object.
    if db_._mode == "tables" and "header" in db_._data:
        db_._data["header"] = db_.DupDetect.result
    else:
        db_._data = db_.DupDetect.result
    return self._return_db(db_, inplace)

def get_duplicates(self, **kwargs: Any) -> pd.DataFrame:
    r"""
    Get duplicate matches in :py:attr:`data`.

    Parameters
    ----------
    \**kwargs : Any
        Additional keyword-arguments used for getting duplicates.

    Returns
    -------
    pd.DataFrame
        DataFrame containing duplicate matches.

    Raises
    ------
    RuntimeError
        Before getting duplicates, a duplicate check has to be done, :py:func:`DataBundle.duplicate_check`.

    See Also
    --------
    DataBundle.remove_duplicates : Remove detected duplicates in `data`.
    DataBundle.flag_duplicates : Flag detected duplicates in `data`.
    DataBundle.duplicate_check : Duplicate check in `data`.

    Notes
    -----
    For more information see :py:func:`DupDetect.get_duplicates`

    Examples
    --------
    >>> matches = db.get_duplicates()
    """
    if self.DupDetect is None:
        raise RuntimeError("Before getting duplicates, a duplicate check has to be done: 'db.duplicate_check()'")
    # Pure delegation: the stored detector produces the matches.
    return self.DupDetect.get_duplicates(**kwargs)

def remove_duplicates(self, inplace: bool = False, **kwargs: Any) -> DataBundle | None:
    r"""
    Remove detected duplicates in :py:attr:`data`.

    Parameters
    ----------
    inplace : bool, default: False
        If True overwrite :py:attr:`data` in :py:class:`~DataBundle`
        else return a copy of :py:class:`~DataBundle` with :py:attr:`data` containing no duplicates.
    \**kwargs : Any
        Additional keyword-arguments used to remove duplicates.

    Returns
    -------
    :py:class:`~DataBundle` or None
        DataBundle without duplicated rows or None if "inplace=True".

    Raises
    ------
    RuntimeError
        Before removing duplicates, a duplicate check has to be done, :py:func:`DataBundle.duplicate_check`.
    TypeError
        If the stored data is not a :py:class:`pandas.DataFrame`.

    See Also
    --------
    DataBundle.flag_duplicates : Flag detected duplicates in `data`.
    DataBundle.get_duplicates : Get duplicate matches in `data`.
    DataBundle.duplicate_check : Duplicate check in `data`.

    Notes
    -----
    For more information see :py:func:`DupDetect.remove_duplicates`

    Examples
    --------
    Remove duplicates without overwriting :py:attr:`data`.

    >>> removed_tables = db.remove_duplicates()

    Remove duplicates with overwriting :py:attr:`data`.

    >>> db.remove_duplicates(inplace=True)
    >>> removed_tables = db.data
    """
    db_ = self._get_db(inplace)
    if db_ is None:
        return None

    if db_.DupDetect is None:
        raise RuntimeError("Before removing duplicates, a duplicate check has to be done: 'db.duplicate_check()'")

    db_.DupDetect.remove_duplicates(**kwargs)
    header_ = db_.DupDetect.result
    if not isinstance(db_._data, pd.DataFrame):
        # f-string so the message reports the offending type rather than the
        # literal placeholder text.
        raise TypeError(f"data has unsupported type: {type(db_._data)}.")
    # Keep only the rows whose index label is still present in the
    # de-duplicated result.
    db_._data = db_._data[db_._data.index.isin(header_.index)]
    return self._return_db(db_, inplace)
3 changes: 0 additions & 3 deletions src/cdm_reader_mapper/duplicates/__init__.py

This file was deleted.

77 changes: 0 additions & 77 deletions src/cdm_reader_mapper/duplicates/_duplicate_settings.py

This file was deleted.

Loading
Loading