uio.utility.datasets.pandas
Data processing with Pandas tables.
It is important to import this module with an alias, for example:
from uio.utility.datasets import pandas as pnd
Otherwise it will collide with the pandas
module itself.
1""" 2Data processing with [Pandas](https://pandas.pydata.org) tables. 3 4It is important to import this module with an alias, for example: 5 6``` py 7from uio.utility.datasets import pandas as pnd 8``` 9 10Otherwise it will collide with the `pandas` module itself. 11""" 12 13import pandas 14 15from typing import Optional, List, Union 16 17from ..logs.log import logger 18 19 20def mergeTables(frames: List[pandas.DataFrame]) -> pandas.DataFrame: 21 """ 22 Merge (*concatenate*) several tables into one. Sorts the index 23 and verifies integrity (*will raise an exception 24 on duplicate/overlapping index*). 25 26 Example: 27 28 ``` py 29 from uio.utility.datasets import pandas as pnd 30 31 frames = [] 32 # ... 33 # frames.append(someTbl1) 34 # frames.append(someTbl2) 35 # frames.append(someTbl3) 36 # ... 37 tbl = pnd.mergeTables(frames) 38 #print(tbl.head(15)) 39 ``` 40 """ 41 42 mergedTable = pandas.concat( 43 frames, 44 verify_integrity=True 45 ).sort_index() 46 47 logger.debug(f"Total records in the resulting table: {len(mergedTable)}") 48 # logger.debug("Preview of the first rows:") 49 # logger.debug(mergedTable.head(15)) 50 51 return mergedTable 52 53 54def deduplicateTable( 55 tbl: pandas.DataFrame, 56 returnUniques: bool = True 57) -> pandas.DataFrame: 58 """ 59 Find duplicate rows in the table: 60 61 - with `returnUniques` set to `True`, returns a table without duplicate 62 rows. The first occurrence is always considered to be unique 63 in the resulting table, even if it has duplicates; 64 - with `returnUniques` set to `False` returns only duplicate rows. 65 66 Example: 67 68 ``` py 69 import pandas 70 from uio.utility.datasets import pandas as pnd 71 72 tbl = pandas.DataFrame( 73 [ 74 [1, 2, 3], 75 [4, 5, 6], 76 [1, 2, 3], 77 [7, 8, 9], 78 [4, 5, 6], 79 [3, 1, 2], 80 [4, 5, 6] 81 ], 82 index=[1, 2, 3, 4, 5, 6, 7], 83 columns=["a", "b", "c"] 84 ) 85 86 deduplicatedTable = pnd.deduplicateTable(tbl, True) 87 #print(deduplicatedTable) 88 # a b c 89 # 1 1 2 3 90 # 2 4 5 6 91 # 4 7 8 9 92 # 6 3 1 2 93 #print(deduplicatedTable.index.values) 94 # [1 2 4 6] 95 96 duplicateRows = pnd.deduplicateTable(tbl, False) 97 #print(duplicateRows) 98 # a b c 99 # 3 1 2 3 100 # 5 4 5 6 101 # 7 4 5 6 102 #print(duplicateRows.index.values) 103 # [3 5 7] 104 ``` 105 """ 106 duplicateRows = tbl.duplicated() 107 duplicateRows = duplicateRows[duplicateRows].index 108 109 logger.debug(f"Unique rows count: {len(tbl) - len(duplicateRows)}") 110 logger.debug(f"Duplicate rows count: {len(duplicateRows)}") 111 # logger.debug(f"Indexes of duplicate rows: {duplicateRows}") 112 113 duplicatesIndex = tbl.index.isin(duplicateRows) 114 115 return ( 116 tbl[~duplicatesIndex] if returnUniques 117 else tbl[duplicatesIndex] 118 ) 119 120 121def dropMeaninglessRows( 122 tbl: pandas.DataFrame, 123 # the types in Union can be extended with whatever else is supported 124 indicatorColumn: Optional[Union[str, int, float]] = None 125) -> pandas.DataFrame: 126 """ 127 Drop/remove meaningless rows from the table. For instance, if a table 128 represents some event timeline, in which there are periods 129 of activity/changes with periods of no changes between them, then it might 130 make sense to drop those rows. 131 132 At the same time, the meaningless rows just before and right after 133 meaningful rows should are not dropped/removed, as they show points of time 134 when activity started/ended. The first row is always kept and is used 135 as a baseline for comparison. 136 137 If `indicatorColumn` is provided, than that cell value from the baseline 138 row is used to determine meaningless rows. Otherwise, the entire rows 139 will be compared with the baseline one. 140 141 Example: 142 143 ``` py 144 import pandas 145 from uio.utility.datasets import pandas as pnd 146 147 tbl = pandas.DataFrame( 148 { 149 "a": [0, 0, 23, 2, 0, 0, 0, 19, 1, 0, 0], 150 "b": [0, 0, 2, 3, 0, 0, 0, 4, 3, 0, 0], 151 "c": [0, 0, 65, 34, 0, 0, 0, 45, 2, 0, 0], 152 "d": [0, 0, 33, 7, 0, 0, 0, 64, 33, 0, 0], 153 "e": [0, 0, 0, 45, 0, 0, 0, 12, 11, 0, 0] 154 }, 155 index=[4, 21, 30, 57, 59, 62, 71, 80, 81, 102, 126] 156 ) 157 #print(tbl) 158 # a b c d e 159 # 4 0 0 0 0 0 160 # 21 0 0 0 0 0 161 # 30 23 2 65 33 0 162 # 57 2 3 34 7 45 163 # 59 0 0 0 0 0 164 # 62 0 0 0 0 0 165 # 71 0 0 0 0 0 166 # 80 19 4 45 64 12 167 # 81 1 3 2 33 11 168 # 102 0 0 0 0 0 169 # 126 0 0 0 0 0 170 171 rez = pnd.dropMeaninglessRows(tbl) 172 #rez = pnd.dropMeaninglessRows(tbl, "a") 173 #print(rez) 174 # a b c d e 175 # 4 0 0 0 0 0 176 # 21 0 0 0 0 0 177 # 30 23 2 65 33 0 178 # 57 2 3 34 7 45 179 # 59 0 0 0 0 0 180 # 71 0 0 0 0 0 181 # 80 19 4 45 64 12 182 # 81 1 3 2 33 11 183 # 102 0 0 0 0 0 184 185 rez = pnd.dropMeaninglessRows(tbl, "e") 186 #print(rez) 187 # a b c d e 188 # 4 0 0 0 0 0 189 # 30 23 2 65 33 0 190 # 57 2 3 34 7 45 191 # 59 0 0 0 0 0 192 # 71 0 0 0 0 0 193 # 80 19 4 45 64 12 194 # 81 1 3 2 33 11 195 # 102 0 0 0 0 0 196 ``` 197 """ 198 logger.debug(f"Original table:\n{tbl}") 199 200 indexLength = len(tbl.index) 201 rowsToDelete = [] 202 if indicatorColumn: # it is enough to compare just one column values 203 if indicatorColumn not in tbl.columns: 204 raise ValueError(f"Table has no column [{indicatorColumn}]") 205 for i in range(1, indexLength): # first row is always kept 206 if ( 207 tbl.at[tbl.index[i], indicatorColumn] 208 == tbl.at[tbl.index[0], indicatorColumn] 209 and 210 tbl.at[tbl.index[i-1], indicatorColumn] 211 == tbl.at[tbl.index[0], indicatorColumn] 212 ): 213 if i < indexLength - 1: # not the last row 214 if ( 215 tbl.at[tbl.index[i+1], indicatorColumn] 216 == tbl.at[tbl.index[0], indicatorColumn] 217 ): 218 rowsToDelete.append(tbl.index[i]) 219 else: 220 rowsToDelete.append(tbl.index[i]) 221 else: # have to compare the entire rows 222 for i in range(1, indexLength): # first row is always kept 223 if ( 224 tbl.iloc[i].equals(tbl.iloc[0]) 225 and 226 tbl.iloc[i-1].equals(tbl.iloc[0]) 227 ): 228 if i < indexLength - 1: # not the last row 229 if tbl.iloc[i+1].equals(tbl.iloc[0]): 230 rowsToDelete.append(tbl.index[i]) 231 else: 232 rowsToDelete.append(tbl.index[i]) 233 234 logger.debug(f"Indexes of the rows to drop: {rowsToDelete}") 235 236 tbl = tbl.drop(index=rowsToDelete) 237 238 logger.debug(f"Table without meaningless rows:\n{tbl}") 239 240 return tbl
21def mergeTables(frames: List[pandas.DataFrame]) -> pandas.DataFrame: 22 """ 23 Merge (*concatenate*) several tables into one. Sorts the index 24 and verifies integrity (*will raise an exception 25 on duplicate/overlapping index*). 26 27 Example: 28 29 ``` py 30 from uio.utility.datasets import pandas as pnd 31 32 frames = [] 33 # ... 34 # frames.append(someTbl1) 35 # frames.append(someTbl2) 36 # frames.append(someTbl3) 37 # ... 38 tbl = pnd.mergeTables(frames) 39 #print(tbl.head(15)) 40 ``` 41 """ 42 43 mergedTable = pandas.concat( 44 frames, 45 verify_integrity=True 46 ).sort_index() 47 48 logger.debug(f"Total records in the resulting table: {len(mergedTable)}") 49 # logger.debug("Preview of the first rows:") 50 # logger.debug(mergedTable.head(15)) 51 52 return mergedTable
Merge (concatenate) several tables into one. Sorts the index and verifies integrity (will raise an exception on duplicate/overlapping index).
Example:
from uio.utility.datasets import pandas as pnd
frames = []
# ...
# frames.append(someTbl1)
# frames.append(someTbl2)
# frames.append(someTbl3)
# ...
tbl = pnd.mergeTables(frames)
#print(tbl.head(15))
55def deduplicateTable( 56 tbl: pandas.DataFrame, 57 returnUniques: bool = True 58) -> pandas.DataFrame: 59 """ 60 Find duplicate rows in the table: 61 62 - with `returnUniques` set to `True`, returns a table without duplicate 63 rows. The first occurrence is always considered to be unique 64 in the resulting table, even if it has duplicates; 65 - with `returnUniques` set to `False` returns only duplicate rows. 66 67 Example: 68 69 ``` py 70 import pandas 71 from uio.utility.datasets import pandas as pnd 72 73 tbl = pandas.DataFrame( 74 [ 75 [1, 2, 3], 76 [4, 5, 6], 77 [1, 2, 3], 78 [7, 8, 9], 79 [4, 5, 6], 80 [3, 1, 2], 81 [4, 5, 6] 82 ], 83 index=[1, 2, 3, 4, 5, 6, 7], 84 columns=["a", "b", "c"] 85 ) 86 87 deduplicatedTable = pnd.deduplicateTable(tbl, True) 88 #print(deduplicatedTable) 89 # a b c 90 # 1 1 2 3 91 # 2 4 5 6 92 # 4 7 8 9 93 # 6 3 1 2 94 #print(deduplicatedTable.index.values) 95 # [1 2 4 6] 96 97 duplicateRows = pnd.deduplicateTable(tbl, False) 98 #print(duplicateRows) 99 # a b c 100 # 3 1 2 3 101 # 5 4 5 6 102 # 7 4 5 6 103 #print(duplicateRows.index.values) 104 # [3 5 7] 105 ``` 106 """ 107 duplicateRows = tbl.duplicated() 108 duplicateRows = duplicateRows[duplicateRows].index 109 110 logger.debug(f"Unique rows count: {len(tbl) - len(duplicateRows)}") 111 logger.debug(f"Duplicate rows count: {len(duplicateRows)}") 112 # logger.debug(f"Indexes of duplicate rows: {duplicateRows}") 113 114 duplicatesIndex = tbl.index.isin(duplicateRows) 115 116 return ( 117 tbl[~duplicatesIndex] if returnUniques 118 else tbl[duplicatesIndex] 119 )
Find duplicate rows in the table:
- with
returnUniques
set toTrue
, returns a table without duplicate rows. The first occurrence is always considered to be unique in the resulting table, even if it has duplicates; - with
returnUniques
set toFalse
returns only duplicate rows.
Example:
import pandas
from uio.utility.datasets import pandas as pnd
tbl = pandas.DataFrame(
[
[1, 2, 3],
[4, 5, 6],
[1, 2, 3],
[7, 8, 9],
[4, 5, 6],
[3, 1, 2],
[4, 5, 6]
],
index=[1, 2, 3, 4, 5, 6, 7],
columns=["a", "b", "c"]
)
deduplicatedTable = pnd.deduplicateTable(tbl, True)
#print(deduplicatedTable)
# a b c
# 1 1 2 3
# 2 4 5 6
# 4 7 8 9
# 6 3 1 2
#print(deduplicatedTable.index.values)
# [1 2 4 6]
duplicateRows = pnd.deduplicateTable(tbl, False)
#print(duplicateRows)
# a b c
# 3 1 2 3
# 5 4 5 6
# 7 4 5 6
#print(duplicateRows.index.values)
# [3 5 7]
122def dropMeaninglessRows( 123 tbl: pandas.DataFrame, 124 # the types in Union can be extended with whatever else is supported 125 indicatorColumn: Optional[Union[str, int, float]] = None 126) -> pandas.DataFrame: 127 """ 128 Drop/remove meaningless rows from the table. For instance, if a table 129 represents some event timeline, in which there are periods 130 of activity/changes with periods of no changes between them, then it might 131 make sense to drop those rows. 132 133 At the same time, the meaningless rows just before and right after 134 meaningful rows should are not dropped/removed, as they show points of time 135 when activity started/ended. The first row is always kept and is used 136 as a baseline for comparison. 137 138 If `indicatorColumn` is provided, than that cell value from the baseline 139 row is used to determine meaningless rows. Otherwise, the entire rows 140 will be compared with the baseline one. 141 142 Example: 143 144 ``` py 145 import pandas 146 from uio.utility.datasets import pandas as pnd 147 148 tbl = pandas.DataFrame( 149 { 150 "a": [0, 0, 23, 2, 0, 0, 0, 19, 1, 0, 0], 151 "b": [0, 0, 2, 3, 0, 0, 0, 4, 3, 0, 0], 152 "c": [0, 0, 65, 34, 0, 0, 0, 45, 2, 0, 0], 153 "d": [0, 0, 33, 7, 0, 0, 0, 64, 33, 0, 0], 154 "e": [0, 0, 0, 45, 0, 0, 0, 12, 11, 0, 0] 155 }, 156 index=[4, 21, 30, 57, 59, 62, 71, 80, 81, 102, 126] 157 ) 158 #print(tbl) 159 # a b c d e 160 # 4 0 0 0 0 0 161 # 21 0 0 0 0 0 162 # 30 23 2 65 33 0 163 # 57 2 3 34 7 45 164 # 59 0 0 0 0 0 165 # 62 0 0 0 0 0 166 # 71 0 0 0 0 0 167 # 80 19 4 45 64 12 168 # 81 1 3 2 33 11 169 # 102 0 0 0 0 0 170 # 126 0 0 0 0 0 171 172 rez = pnd.dropMeaninglessRows(tbl) 173 #rez = pnd.dropMeaninglessRows(tbl, "a") 174 #print(rez) 175 # a b c d e 176 # 4 0 0 0 0 0 177 # 21 0 0 0 0 0 178 # 30 23 2 65 33 0 179 # 57 2 3 34 7 45 180 # 59 0 0 0 0 0 181 # 71 0 0 0 0 0 182 # 80 19 4 45 64 12 183 # 81 1 3 2 33 11 184 # 102 0 0 0 0 0 185 186 rez = pnd.dropMeaninglessRows(tbl, "e") 187 #print(rez) 188 # a b c d e 189 # 4 0 0 0 0 0 190 # 30 23 2 65 33 0 191 # 57 2 3 34 7 45 192 # 59 0 0 0 0 0 193 # 71 0 0 0 0 0 194 # 80 19 4 45 64 12 195 # 81 1 3 2 33 11 196 # 102 0 0 0 0 0 197 ``` 198 """ 199 logger.debug(f"Original table:\n{tbl}") 200 201 indexLength = len(tbl.index) 202 rowsToDelete = [] 203 if indicatorColumn: # it is enough to compare just one column values 204 if indicatorColumn not in tbl.columns: 205 raise ValueError(f"Table has no column [{indicatorColumn}]") 206 for i in range(1, indexLength): # first row is always kept 207 if ( 208 tbl.at[tbl.index[i], indicatorColumn] 209 == tbl.at[tbl.index[0], indicatorColumn] 210 and 211 tbl.at[tbl.index[i-1], indicatorColumn] 212 == tbl.at[tbl.index[0], indicatorColumn] 213 ): 214 if i < indexLength - 1: # not the last row 215 if ( 216 tbl.at[tbl.index[i+1], indicatorColumn] 217 == tbl.at[tbl.index[0], indicatorColumn] 218 ): 219 rowsToDelete.append(tbl.index[i]) 220 else: 221 rowsToDelete.append(tbl.index[i]) 222 else: # have to compare the entire rows 223 for i in range(1, indexLength): # first row is always kept 224 if ( 225 tbl.iloc[i].equals(tbl.iloc[0]) 226 and 227 tbl.iloc[i-1].equals(tbl.iloc[0]) 228 ): 229 if i < indexLength - 1: # not the last row 230 if tbl.iloc[i+1].equals(tbl.iloc[0]): 231 rowsToDelete.append(tbl.index[i]) 232 else: 233 rowsToDelete.append(tbl.index[i]) 234 235 logger.debug(f"Indexes of the rows to drop: {rowsToDelete}") 236 237 tbl = tbl.drop(index=rowsToDelete) 238 239 logger.debug(f"Table without meaningless rows:\n{tbl}") 240 241 return tbl
Drop/remove meaningless rows from the table. For instance, if a table represents some event timeline, in which there are periods of activity/changes with periods of no changes between them, then it might make sense to drop those rows.
At the same time, the meaningless rows just before and right after meaningful rows should are not dropped/removed, as they show points of time when activity started/ended. The first row is always kept and is used as a baseline for comparison.
If indicatorColumn
is provided, than that cell value from the baseline
row is used to determine meaningless rows. Otherwise, the entire rows
will be compared with the baseline one.
Example:
import pandas
from uio.utility.datasets import pandas as pnd
tbl = pandas.DataFrame(
{
"a": [0, 0, 23, 2, 0, 0, 0, 19, 1, 0, 0],
"b": [0, 0, 2, 3, 0, 0, 0, 4, 3, 0, 0],
"c": [0, 0, 65, 34, 0, 0, 0, 45, 2, 0, 0],
"d": [0, 0, 33, 7, 0, 0, 0, 64, 33, 0, 0],
"e": [0, 0, 0, 45, 0, 0, 0, 12, 11, 0, 0]
},
index=[4, 21, 30, 57, 59, 62, 71, 80, 81, 102, 126]
)
#print(tbl)
# a b c d e
# 4 0 0 0 0 0
# 21 0 0 0 0 0
# 30 23 2 65 33 0
# 57 2 3 34 7 45
# 59 0 0 0 0 0
# 62 0 0 0 0 0
# 71 0 0 0 0 0
# 80 19 4 45 64 12
# 81 1 3 2 33 11
# 102 0 0 0 0 0
# 126 0 0 0 0 0
rez = pnd.dropMeaninglessRows(tbl)
#rez = pnd.dropMeaninglessRows(tbl, "a")
#print(rez)
# a b c d e
# 4 0 0 0 0 0
# 21 0 0 0 0 0
# 30 23 2 65 33 0
# 57 2 3 34 7 45
# 59 0 0 0 0 0
# 71 0 0 0 0 0
# 80 19 4 45 64 12
# 81 1 3 2 33 11
# 102 0 0 0 0 0
rez = pnd.dropMeaninglessRows(tbl, "e")
#print(rez)
# a b c d e
# 4 0 0 0 0 0
# 30 23 2 65 33 0
# 57 2 3 34 7 45
# 59 0 0 0 0 0
# 71 0 0 0 0 0
# 80 19 4 45 64 12
# 81 1 3 2 33 11
# 102 0 0 0 0 0