Edit on GitHub

uio.utility.datasets.pandas

Data processing with Pandas tables.

It is important to import this module with an alias, for example:

from uio.utility.datasets import pandas as pnd

Otherwise it will collide with the pandas module itself.

  1"""
  2Data processing with [Pandas](https://pandas.pydata.org) tables.
  3
  4It is important to import this module with an alias, for example:
  5
  6``` py
  7from uio.utility.datasets import pandas as pnd
  8```
  9
 10Otherwise it will collide with the `pandas` module itself.
 11"""
 12
 13import pandas
 14
 15from typing import Optional, List, Union
 16
 17from ..logs.log import logger
 18
 19
 20def mergeTables(frames: List[pandas.DataFrame]) -> pandas.DataFrame:
 21    """
 22    Merge (*concatenate*) several tables into one. Sorts the index
 23    and verifies integrity (*will raise an exception
 24    on duplicate/overlapping index*).
 25
 26    Example:
 27
 28    ``` py
 29    from uio.utility.datasets import pandas as pnd
 30
 31    frames = []
 32    # ...
 33    # frames.append(someTbl1)
 34    # frames.append(someTbl2)
 35    # frames.append(someTbl3)
 36    # ...
 37    tbl = pnd.mergeTables(frames)
 38    #print(tbl.head(15))
 39    ```
 40    """
 41
 42    mergedTable = pandas.concat(
 43        frames,
 44        verify_integrity=True
 45    ).sort_index()
 46
 47    logger.debug(f"Total records in the resulting table: {len(mergedTable)}")
 48    # logger.debug("Preview of the first rows:")
 49    # logger.debug(mergedTable.head(15))
 50
 51    return mergedTable
 52
 53
 54def deduplicateTable(
 55    tbl: pandas.DataFrame,
 56    returnUniques: bool = True
 57) -> pandas.DataFrame:
 58    """
 59    Find duplicate rows in the table:
 60
 61    - with `returnUniques` set to `True`, returns a table without duplicate
 62    rows. The first occurrence is always considered to be unique
 63    in the resulting table, even if it has duplicates;
 64    - with `returnUniques` set to `False` returns only duplicate rows.
 65
 66    Example:
 67
 68    ``` py
 69    import pandas
 70    from uio.utility.datasets import pandas as pnd
 71
 72    tbl = pandas.DataFrame(
 73        [
 74            [1, 2, 3],
 75            [4, 5, 6],
 76            [1, 2, 3],
 77            [7, 8, 9],
 78            [4, 5, 6],
 79            [3, 1, 2],
 80            [4, 5, 6]
 81        ],
 82        index=[1, 2, 3, 4, 5, 6, 7],
 83        columns=["a", "b", "c"]
 84    )
 85
 86    deduplicatedTable = pnd.deduplicateTable(tbl, True)
 87    #print(deduplicatedTable)
 88    #    a  b  c
 89    # 1  1  2  3
 90    # 2  4  5  6
 91    # 4  7  8  9
 92    # 6  3  1  2
 93    #print(deduplicatedTable.index.values)
 94    # [1 2 4 6]
 95
 96    duplicateRows = pnd.deduplicateTable(tbl, False)
 97    #print(duplicateRows)
 98    #    a  b  c
 99    # 3  1  2  3
100    # 5  4  5  6
101    # 7  4  5  6
102    #print(duplicateRows.index.values)
103    # [3 5 7]
104    ```
105    """
106    duplicateRows = tbl.duplicated()
107    duplicateRows = duplicateRows[duplicateRows].index
108
109    logger.debug(f"Unique rows count: {len(tbl) - len(duplicateRows)}")
110    logger.debug(f"Duplicate rows count: {len(duplicateRows)}")
111    # logger.debug(f"Indexes of duplicate rows: {duplicateRows}")
112
113    duplicatesIndex = tbl.index.isin(duplicateRows)
114
115    return (
116        tbl[~duplicatesIndex] if returnUniques
117        else tbl[duplicatesIndex]
118    )
119
120
121def dropMeaninglessRows(
122    tbl: pandas.DataFrame,
123    # the types in Union can be extended with whatever else is supported
124    indicatorColumn: Optional[Union[str, int, float]] = None
125) -> pandas.DataFrame:
126    """
127    Drop/remove meaningless rows from the table. For instance, if a table
128    represents some event timeline, in which there are periods
129    of activity/changes with periods of no changes between them, then it might
130    make sense to drop those rows.
131
132    At the same time, the meaningless rows just before and right after
133    meaningful rows should are not dropped/removed, as they show points of time
134    when activity started/ended. The first row is always kept and is used
135    as a baseline for comparison.
136
137    If `indicatorColumn` is provided, than that cell value from the baseline
138    row is used to determine meaningless rows. Otherwise, the entire rows
139    will be compared with the baseline one.
140
141    Example:
142
143    ``` py
144    import pandas
145    from uio.utility.datasets import pandas as pnd
146
147    tbl = pandas.DataFrame(
148        {
149            "a": [0, 0, 23, 2,  0, 0, 0, 19, 1,  0, 0],
150            "b": [0, 0, 2,  3,  0, 0, 0, 4,  3,  0, 0],
151            "c": [0, 0, 65, 34, 0, 0, 0, 45, 2,  0, 0],
152            "d": [0, 0, 33, 7,  0, 0, 0, 64, 33, 0, 0],
153            "e": [0, 0, 0,  45, 0, 0, 0, 12, 11, 0, 0]
154        },
155        index=[4, 21, 30, 57, 59, 62, 71, 80, 81, 102, 126]
156    )
157    #print(tbl)
158    #       a  b   c   d   e
159    # 4     0  0   0   0   0
160    # 21    0  0   0   0   0
161    # 30   23  2  65  33   0
162    # 57    2  3  34   7  45
163    # 59    0  0   0   0   0
164    # 62    0  0   0   0   0
165    # 71    0  0   0   0   0
166    # 80   19  4  45  64  12
167    # 81    1  3   2  33  11
168    # 102   0  0   0   0   0
169    # 126   0  0   0   0   0
170
171    rez = pnd.dropMeaninglessRows(tbl)
172    #rez = pnd.dropMeaninglessRows(tbl, "a")
173    #print(rez)
174    #       a  b   c   d   e
175    # 4     0  0   0   0   0
176    # 21    0  0   0   0   0
177    # 30   23  2  65  33   0
178    # 57    2  3  34   7  45
179    # 59    0  0   0   0   0
180    # 71    0  0   0   0   0
181    # 80   19  4  45  64  12
182    # 81    1  3   2  33  11
183    # 102   0  0   0   0   0
184
185    rez = pnd.dropMeaninglessRows(tbl, "e")
186    #print(rez)
187    #       a  b   c   d   e
188    # 4     0  0   0   0   0
189    # 30   23  2  65  33   0
190    # 57    2  3  34   7  45
191    # 59    0  0   0   0   0
192    # 71    0  0   0   0   0
193    # 80   19  4  45  64  12
194    # 81    1  3   2  33  11
195    # 102   0  0   0   0   0
196    ```
197    """
198    logger.debug(f"Original table:\n{tbl}")
199
200    indexLength = len(tbl.index)
201    rowsToDelete = []
202    if indicatorColumn:  # it is enough to compare just one column values
203        if indicatorColumn not in tbl.columns:
204            raise ValueError(f"Table has no column [{indicatorColumn}]")
205        for i in range(1, indexLength):  # first row is always kept
206            if (
207                tbl.at[tbl.index[i], indicatorColumn]
208                == tbl.at[tbl.index[0], indicatorColumn]
209                and
210                tbl.at[tbl.index[i-1], indicatorColumn]
211                == tbl.at[tbl.index[0], indicatorColumn]
212            ):
213                if i < indexLength - 1:  # not the last row
214                    if (
215                        tbl.at[tbl.index[i+1], indicatorColumn]
216                        == tbl.at[tbl.index[0], indicatorColumn]
217                    ):
218                        rowsToDelete.append(tbl.index[i])
219                else:
220                    rowsToDelete.append(tbl.index[i])
221    else:  # have to compare the entire rows
222        for i in range(1, indexLength):  # first row is always kept
223            if (
224                tbl.iloc[i].equals(tbl.iloc[0])
225                and
226                tbl.iloc[i-1].equals(tbl.iloc[0])
227            ):
228                if i < indexLength - 1:  # not the last row
229                    if tbl.iloc[i+1].equals(tbl.iloc[0]):
230                        rowsToDelete.append(tbl.index[i])
231                else:
232                    rowsToDelete.append(tbl.index[i])
233
234    logger.debug(f"Indexes of the rows to drop: {rowsToDelete}")
235
236    tbl = tbl.drop(index=rowsToDelete)
237
238    logger.debug(f"Table without meaningless rows:\n{tbl}")
239
240    return tbl
def mergeTables(frames: List[pandas.core.frame.DataFrame]) -> pandas.core.frame.DataFrame:
21def mergeTables(frames: List[pandas.DataFrame]) -> pandas.DataFrame:
22    """
23    Merge (*concatenate*) several tables into one. Sorts the index
24    and verifies integrity (*will raise an exception
25    on duplicate/overlapping index*).
26
27    Example:
28
29    ``` py
30    from uio.utility.datasets import pandas as pnd
31
32    frames = []
33    # ...
34    # frames.append(someTbl1)
35    # frames.append(someTbl2)
36    # frames.append(someTbl3)
37    # ...
38    tbl = pnd.mergeTables(frames)
39    #print(tbl.head(15))
40    ```
41    """
42
43    mergedTable = pandas.concat(
44        frames,
45        verify_integrity=True
46    ).sort_index()
47
48    logger.debug(f"Total records in the resulting table: {len(mergedTable)}")
49    # logger.debug("Preview of the first rows:")
50    # logger.debug(mergedTable.head(15))
51
52    return mergedTable

Merge (concatenate) several tables into one. Sorts the index and verifies integrity (will raise an exception on duplicate/overlapping index).

Example:

from uio.utility.datasets import pandas as pnd

frames = []
# ...
# frames.append(someTbl1)
# frames.append(someTbl2)
# frames.append(someTbl3)
# ...
tbl = pnd.mergeTables(frames)
#print(tbl.head(15))
def deduplicateTable( tbl: pandas.core.frame.DataFrame, returnUniques: bool = True) -> pandas.core.frame.DataFrame:
 55def deduplicateTable(
 56    tbl: pandas.DataFrame,
 57    returnUniques: bool = True
 58) -> pandas.DataFrame:
 59    """
 60    Find duplicate rows in the table:
 61
 62    - with `returnUniques` set to `True`, returns a table without duplicate
 63    rows. The first occurrence is always considered to be unique
 64    in the resulting table, even if it has duplicates;
 65    - with `returnUniques` set to `False` returns only duplicate rows.
 66
 67    Example:
 68
 69    ``` py
 70    import pandas
 71    from uio.utility.datasets import pandas as pnd
 72
 73    tbl = pandas.DataFrame(
 74        [
 75            [1, 2, 3],
 76            [4, 5, 6],
 77            [1, 2, 3],
 78            [7, 8, 9],
 79            [4, 5, 6],
 80            [3, 1, 2],
 81            [4, 5, 6]
 82        ],
 83        index=[1, 2, 3, 4, 5, 6, 7],
 84        columns=["a", "b", "c"]
 85    )
 86
 87    deduplicatedTable = pnd.deduplicateTable(tbl, True)
 88    #print(deduplicatedTable)
 89    #    a  b  c
 90    # 1  1  2  3
 91    # 2  4  5  6
 92    # 4  7  8  9
 93    # 6  3  1  2
 94    #print(deduplicatedTable.index.values)
 95    # [1 2 4 6]
 96
 97    duplicateRows = pnd.deduplicateTable(tbl, False)
 98    #print(duplicateRows)
 99    #    a  b  c
100    # 3  1  2  3
101    # 5  4  5  6
102    # 7  4  5  6
103    #print(duplicateRows.index.values)
104    # [3 5 7]
105    ```
106    """
107    duplicateRows = tbl.duplicated()
108    duplicateRows = duplicateRows[duplicateRows].index
109
110    logger.debug(f"Unique rows count: {len(tbl) - len(duplicateRows)}")
111    logger.debug(f"Duplicate rows count: {len(duplicateRows)}")
112    # logger.debug(f"Indexes of duplicate rows: {duplicateRows}")
113
114    duplicatesIndex = tbl.index.isin(duplicateRows)
115
116    return (
117        tbl[~duplicatesIndex] if returnUniques
118        else tbl[duplicatesIndex]
119    )

Find duplicate rows in the table:

  • with returnUniques set to True, returns a table without duplicate rows. The first occurrence is always considered to be unique in the resulting table, even if it has duplicates;
  • with returnUniques set to False returns only duplicate rows.

Example:

import pandas
from uio.utility.datasets import pandas as pnd

tbl = pandas.DataFrame(
    [
        [1, 2, 3],
        [4, 5, 6],
        [1, 2, 3],
        [7, 8, 9],
        [4, 5, 6],
        [3, 1, 2],
        [4, 5, 6]
    ],
    index=[1, 2, 3, 4, 5, 6, 7],
    columns=["a", "b", "c"]
)

deduplicatedTable = pnd.deduplicateTable(tbl, True)
#print(deduplicatedTable)
#    a  b  c
# 1  1  2  3
# 2  4  5  6
# 4  7  8  9
# 6  3  1  2
#print(deduplicatedTable.index.values)
# [1 2 4 6]

duplicateRows = pnd.deduplicateTable(tbl, False)
#print(duplicateRows)
#    a  b  c
# 3  1  2  3
# 5  4  5  6
# 7  4  5  6
#print(duplicateRows.index.values)
# [3 5 7]
def dropMeaninglessRows( tbl: pandas.core.frame.DataFrame, indicatorColumn: Union[str, int, float, NoneType] = None) -> pandas.core.frame.DataFrame:
122def dropMeaninglessRows(
123    tbl: pandas.DataFrame,
124    # the types in Union can be extended with whatever else is supported
125    indicatorColumn: Optional[Union[str, int, float]] = None
126) -> pandas.DataFrame:
127    """
128    Drop/remove meaningless rows from the table. For instance, if a table
129    represents some event timeline, in which there are periods
130    of activity/changes with periods of no changes between them, then it might
131    make sense to drop those rows.
132
133    At the same time, the meaningless rows just before and right after
134    meaningful rows should are not dropped/removed, as they show points of time
135    when activity started/ended. The first row is always kept and is used
136    as a baseline for comparison.
137
138    If `indicatorColumn` is provided, than that cell value from the baseline
139    row is used to determine meaningless rows. Otherwise, the entire rows
140    will be compared with the baseline one.
141
142    Example:
143
144    ``` py
145    import pandas
146    from uio.utility.datasets import pandas as pnd
147
148    tbl = pandas.DataFrame(
149        {
150            "a": [0, 0, 23, 2,  0, 0, 0, 19, 1,  0, 0],
151            "b": [0, 0, 2,  3,  0, 0, 0, 4,  3,  0, 0],
152            "c": [0, 0, 65, 34, 0, 0, 0, 45, 2,  0, 0],
153            "d": [0, 0, 33, 7,  0, 0, 0, 64, 33, 0, 0],
154            "e": [0, 0, 0,  45, 0, 0, 0, 12, 11, 0, 0]
155        },
156        index=[4, 21, 30, 57, 59, 62, 71, 80, 81, 102, 126]
157    )
158    #print(tbl)
159    #       a  b   c   d   e
160    # 4     0  0   0   0   0
161    # 21    0  0   0   0   0
162    # 30   23  2  65  33   0
163    # 57    2  3  34   7  45
164    # 59    0  0   0   0   0
165    # 62    0  0   0   0   0
166    # 71    0  0   0   0   0
167    # 80   19  4  45  64  12
168    # 81    1  3   2  33  11
169    # 102   0  0   0   0   0
170    # 126   0  0   0   0   0
171
172    rez = pnd.dropMeaninglessRows(tbl)
173    #rez = pnd.dropMeaninglessRows(tbl, "a")
174    #print(rez)
175    #       a  b   c   d   e
176    # 4     0  0   0   0   0
177    # 21    0  0   0   0   0
178    # 30   23  2  65  33   0
179    # 57    2  3  34   7  45
180    # 59    0  0   0   0   0
181    # 71    0  0   0   0   0
182    # 80   19  4  45  64  12
183    # 81    1  3   2  33  11
184    # 102   0  0   0   0   0
185
186    rez = pnd.dropMeaninglessRows(tbl, "e")
187    #print(rez)
188    #       a  b   c   d   e
189    # 4     0  0   0   0   0
190    # 30   23  2  65  33   0
191    # 57    2  3  34   7  45
192    # 59    0  0   0   0   0
193    # 71    0  0   0   0   0
194    # 80   19  4  45  64  12
195    # 81    1  3   2  33  11
196    # 102   0  0   0   0   0
197    ```
198    """
199    logger.debug(f"Original table:\n{tbl}")
200
201    indexLength = len(tbl.index)
202    rowsToDelete = []
203    if indicatorColumn:  # it is enough to compare just one column values
204        if indicatorColumn not in tbl.columns:
205            raise ValueError(f"Table has no column [{indicatorColumn}]")
206        for i in range(1, indexLength):  # first row is always kept
207            if (
208                tbl.at[tbl.index[i], indicatorColumn]
209                == tbl.at[tbl.index[0], indicatorColumn]
210                and
211                tbl.at[tbl.index[i-1], indicatorColumn]
212                == tbl.at[tbl.index[0], indicatorColumn]
213            ):
214                if i < indexLength - 1:  # not the last row
215                    if (
216                        tbl.at[tbl.index[i+1], indicatorColumn]
217                        == tbl.at[tbl.index[0], indicatorColumn]
218                    ):
219                        rowsToDelete.append(tbl.index[i])
220                else:
221                    rowsToDelete.append(tbl.index[i])
222    else:  # have to compare the entire rows
223        for i in range(1, indexLength):  # first row is always kept
224            if (
225                tbl.iloc[i].equals(tbl.iloc[0])
226                and
227                tbl.iloc[i-1].equals(tbl.iloc[0])
228            ):
229                if i < indexLength - 1:  # not the last row
230                    if tbl.iloc[i+1].equals(tbl.iloc[0]):
231                        rowsToDelete.append(tbl.index[i])
232                else:
233                    rowsToDelete.append(tbl.index[i])
234
235    logger.debug(f"Indexes of the rows to drop: {rowsToDelete}")
236
237    tbl = tbl.drop(index=rowsToDelete)
238
239    logger.debug(f"Table without meaningless rows:\n{tbl}")
240
241    return tbl

Drop/remove meaningless rows from the table. For instance, if a table represents some event timeline, in which there are periods of activity/changes with periods of no changes between them, then it might make sense to drop those rows.

At the same time, the meaningless rows just before and right after meaningful rows should are not dropped/removed, as they show points of time when activity started/ended. The first row is always kept and is used as a baseline for comparison.

If indicatorColumn is provided, than that cell value from the baseline row is used to determine meaningless rows. Otherwise, the entire rows will be compared with the baseline one.

Example:

import pandas
from uio.utility.datasets import pandas as pnd

tbl = pandas.DataFrame(
    {
        "a": [0, 0, 23, 2,  0, 0, 0, 19, 1,  0, 0],
        "b": [0, 0, 2,  3,  0, 0, 0, 4,  3,  0, 0],
        "c": [0, 0, 65, 34, 0, 0, 0, 45, 2,  0, 0],
        "d": [0, 0, 33, 7,  0, 0, 0, 64, 33, 0, 0],
        "e": [0, 0, 0,  45, 0, 0, 0, 12, 11, 0, 0]
    },
    index=[4, 21, 30, 57, 59, 62, 71, 80, 81, 102, 126]
)
#print(tbl)
#       a  b   c   d   e
# 4     0  0   0   0   0
# 21    0  0   0   0   0
# 30   23  2  65  33   0
# 57    2  3  34   7  45
# 59    0  0   0   0   0
# 62    0  0   0   0   0
# 71    0  0   0   0   0
# 80   19  4  45  64  12
# 81    1  3   2  33  11
# 102   0  0   0   0   0
# 126   0  0   0   0   0

rez = pnd.dropMeaninglessRows(tbl)
#rez = pnd.dropMeaninglessRows(tbl, "a")
#print(rez)
#       a  b   c   d   e
# 4     0  0   0   0   0
# 21    0  0   0   0   0
# 30   23  2  65  33   0
# 57    2  3  34   7  45
# 59    0  0   0   0   0
# 71    0  0   0   0   0
# 80   19  4  45  64  12
# 81    1  3   2  33  11
# 102   0  0   0   0   0

rez = pnd.dropMeaninglessRows(tbl, "e")
#print(rez)
#       a  b   c   d   e
# 4     0  0   0   0   0
# 30   23  2  65  33   0
# 57    2  3  34   7  45
# 59    0  0   0   0   0
# 71    0  0   0   0   0
# 80   19  4  45  64  12
# 81    1  3   2  33  11
# 102   0  0   0   0   0