API Reference

Alex Wu's utility functions for data processing, visualization, and reporting.

| Metadata | Description |
|---|---|
| Author | Alexander Wu |
| Email | alexander.wu7@gmail.com |
| Date Modified | August 2023 |

DF(*args, **kwargs)

Create PyArrow Pandas DataFrame.

Returns:

| Type | Description |
|---|---|
| pd.DataFrame | pd.DataFrame with pyarrow data types |

Source code in alexwu/alexwu.py, lines 134-141
def DF(*args, **kwargs) -> pd.DataFrame:
    """Create PyArrow Pandas DataFrame.

    Returns:
        pd.DataFrame: pd.DataFrame with pyarrow data types
    """
    df = pd.DataFrame(*args, **kwargs).convert_dtypes(dtype_backend='pyarrow')
    return df
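
For example, a minimal sketch (exact dtype reprs depend on your pandas and pyarrow versions):

>>> df = DF({'a': [1, 2, None], 'b': ['x', 'y', 'z']})
>>> df.dtypes
a     int64[pyarrow]
b    string[pyarrow]
dtype: object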

S(*args, **kwargs)

Create PyArrow Pandas Series.

Returns:

| Type | Description |
|---|---|
| pd.Series | pd.Series with pyarrow data types |

Source code in alexwu/alexwu.py, lines 125-132
def S(*args, **kwargs) -> pd.Series:
    """Create PyArrow Pandas Series.

    Returns:
        pd.Series: pd.Series with pyarrow data types
    """
    s = pd.Series(*args, **kwargs).convert_dtypes(dtype_backend='pyarrow')
    return s
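
The Series counterpart behaves the same way (again assuming a pyarrow-enabled pandas install):

>>> S([1, 2, None]).dtype
int64[pyarrow]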

add_prefix(df, prefix, subset=None, regex=None)

Add prefix to columns

Parameters:

| Name | Type | Description | Default |
|---|---|---|---|
| df | pd.DataFrame | Input dataframe | required |
| prefix | str | prefix string to prepend | required |
| subset | str \| list[str] | subset of columns to affect. Defaults to None. | None |
| regex | str | regex filter of columns to affect. Defaults to None. | None |

Returns:

| Type | Description |
|---|---|
| pd.DataFrame | Transformed dataframe with certain columns prepended with prefix |

Source code in alexwu/alexwu.py, lines 233-253
def add_prefix(df: pd.DataFrame, prefix: str, subset=None, regex=None) -> pd.DataFrame:
    """Add prefix to columns

    Args:
        df (pd.DataFrame): Input dataframe
        prefix (str): prefix string to prepend
        subset (_type_, optional): subset of columns to affect. Defaults to None.
        regex (_type_, optional): regex filter of columns to affect. Defaults to None.

    Returns:
        pd.DataFrame: Transformed dataframe with certain columns prepended with prefix
    """
    cols = list(df.columns)
    if regex is not None:
        # Keep the matching column names rather than the boolean mask itself
        cols = list(df.columns[df.columns.str.contains(regex)])
    if isinstance(subset, str):
        subset = [subset]
    if hasattr(subset, '__contains__'):
        cols = [col for col in cols if col in subset]
    df_prefix = df.rename(columns={col: f'{prefix}{col}' for col in cols})
    return df_prefix
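
A quick sketch of the subset filter:

>>> df = pd.DataFrame({'age': [1], 'height': [2], 'weight': [3]})
>>> df.pipe(add_prefix, 'x_', subset=['age', 'height']).columns.tolist()
['x_age', 'x_height', 'weight']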

add_suffix(df, suffix, subset=None, regex=None)

Add suffix to columns

Parameters:

| Name | Type | Description | Default |
|---|---|---|---|
| df | pd.DataFrame | Input dataframe | required |
| suffix | str | suffix string to append | required |
| subset | str \| list[str] | subset of columns to affect. Defaults to None. | None |
| regex | str | regex filter of columns to affect. Defaults to None. | None |

Returns:

| Type | Description |
|---|---|
| pd.DataFrame | Transformed dataframe with certain columns appended with suffix |

Source code in alexwu/alexwu.py, lines 255-275
def add_suffix(df: pd.DataFrame, suffix: str, subset=None, regex=None) -> pd.DataFrame:
    """Add suffix to columns

    Args:
        df (pd.DataFrame): Input dataframe
        suffix (str): suffix string to append
        subset (_type_, optional): subset of columns to affect. Defaults to None.
        regex (_type_, optional): regex filter of columns to affect. Defaults to None.

    Returns:
        pd.DataFrame: Transformed dataframe with certain columns appended with suffix
    """
    cols = list(df.columns)
    if regex is not None:
        # Keep the matching column names rather than the boolean mask itself
        cols = list(df.columns[df.columns.str.contains(regex)])
    if isinstance(subset, str):
        subset = [subset]
    if hasattr(subset, '__contains__'):
        cols = [col for col in cols if col in subset]
    df_suffix = df.rename(columns={col: f'{col}{suffix}' for col in cols})
    return df_suffix
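
And a sketch of the regex filter (using the corrected column matching above):

>>> df = pd.DataFrame({'age': [1], 'height': [2], 'weight': [3]})
>>> df.pipe(add_suffix, '_cm', regex='ght').columns.tolist()
['age', 'height_cm', 'weight_cm']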

combo_sizes(set_list, set_names=None, vmax=None, sort=True)

Summary table of set combination sizes. Each row gives the size of one combination of overlapping sets.

Parameters:

| Name | Type | Description | Default |
|---|---|---|---|
| set_list | list[set] | input sets | required |
| set_names | list[str] | Names corresponding to input sets. Defaults to None. | None |
| vmax | int | Denominator for percentage. Defaults to # unique elements among all sets in set_list. | None |
| sort | bool | Sort overlap percentages. Defaults to True. | True |

Returns:

| Type | Description |
|---|---|
| pd.DataFrame | size of overlapping sets |

Source code in alexwu/alexwu.py, lines 776-814
def combo_sizes(set_list: list[set], set_names=None, vmax=None, sort=True) -> pd.DataFrame:
    """Summarary table of set combinations sizes. Rows represent size of overlapping sets

    Args:
        set_list (list[set]): inputs sets
        set_names (_type_, optional): Names corresponding to input sets. Defaults to None.
        vmax (_type_, optional): Denominator for percentage. Defaults to # unique elements among all sets in set_list.
        sort (bool, optional): Sort overlap percentages. Defaults to True.

    Returns:
        pd.DataFrame: size of overlapping sets
    """
    if vmax is None:
        vmax = len(reduce(lambda x, y: x | y, set_list))
    if set_names is None:
        set_names = list(range(len(set_list)))
    combo_list = [()]
    sizes_list = [vmax]
    for k in range(1, len(set_list)+1):
        for indices_combo in itertools.combinations(enumerate(set_list), k):
            indices, combo = zip(*indices_combo)
            size = len(reduce(lambda x, y: x & y, combo))
            sizes_list.append(size)
            combo_list.append(indices)
    combo_df = pd.DataFrame([['Yes' if i in i_list else '-' for i in range(len(set_names))]
                             for i_list in combo_list], columns=set_names)
    combo_df['Size'] = sizes_list
    combo_df['%'] = 100*combo_df['Size'] / vmax
    if sort:
        combo_df = combo_df.sort_values('Size', ascending=False)
    combo_df.index += 1

    def highlight(s):
        return ['background-color: green' if v else '' for v in s == 'Yes']
    combo_df_styled = (combo_df
            .style.apply(highlight)
            .bar(color='#543b66', vmin=0, vmax=100, subset=['%'])
            .format(precision=1))
    return combo_df_styled
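
A minimal sketch (note the function actually returns a pandas Styler, which renders as a styled table in a notebook):

>>> A, B = {1, 2, 3}, {2, 3, 4}
>>> combo_sizes([A, B], set_names=['A', 'B'])

The row marked 'Yes' for both A and B has Size 2 ({2, 3}) and % 50.0, since vmax defaults to len(A | B) == 4.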

combo_sizes2(set_list, set_names=None, vmax=None, sort=True)

Summary table of set combination sizes (strict).

Rows represent the size of each exact set combination only (elements that belong to those sets and no others).

Parameters:

| Name | Type | Description | Default |
|---|---|---|---|
| set_list | list[set] | input sets | required |
| set_names | list[str] | Names corresponding to input sets. Defaults to None. | None |
| vmax | int | Denominator for percentage. Defaults to # unique elements among all sets in set_list. | None |
| sort | bool | Sort overlap percentages. Defaults to True. | True |

Returns:

| Type | Description |
|---|---|
| pd.DataFrame | size of overlapping sets (strict) |

Source code in alexwu/alexwu.py, lines 816-870
def combo_sizes2(set_list: list[set], set_names=None, vmax=None, sort=True) -> pd.DataFrame:
    """Summarary table of set combinations sizes (strict).

    Rows represent size of overlapping sets only(which don't containing others).

    Args:
        set_list (list[set]): inputs sets
        set_names (_type_, optional): Names corresponding to input sets. Defaults to None.
        vmax (_type_, optional): Denominator for percentage. Defaults to # unique elements among all sets in set_list.
        sort (bool, optional): Sort overlap percentages. Defaults to True.

    Returns:
        pd.DataFrame: size of overlapping sets (strict)
    """
    if vmax is None:
        vmax = len(reduce(lambda x, y: x | y, set_list))
    if set_names is None:
        set_names = list(range(len(set_list)))
    combo_list = [()]
    sizes_list = [vmax]

    for k in range(1, len(set_list)+1):
        for indices, combo, other_combo in zip(
            itertools.combinations(range(len(set_list)), k),
            itertools.combinations(set_list, k),
            list(itertools.combinations(set_list, len(set_list) - k))[::-1]
        ):
            row_vals = reduce(lambda x, y: x & y, combo)
            if other_combo:
                row_vals = row_vals - reduce(lambda x, y: x | y, other_combo)
            size = len(row_vals)
            sizes_list.append(size)
            combo_list.append(indices)

    combo_df = pd.DataFrame([
            # First row is union of all values
            ['-']*len(set_names),
            # All other rows proceed normally as combinations of 'Yes', 'No'
            *[['Yes' if i in i_list else 'No' for i in range(len(set_names))]
                for i_list in combo_list[1:]]
        ], columns=set_names)
    combo_df['Size'] = sizes_list
    combo_df['%'] = 100*combo_df['Size'] / vmax
    if sort:
        combo_df = combo_df.sort_values('Size', ascending=False)
    combo_df.index += 1

    def highlight(s):
        return ['background-color: green' if v == 'Yes' else 'background-color: darkred'
                if v == 'No' else '' for v in s]
    combo_df_styled = (combo_df
            .style.apply(highlight)
            .bar(color='#543b66', vmin=0, vmax=100, subset=['%'])
            .format(precision=1))
    return combo_df_styled

copy(text)

Copy text to clipboard.

Parameters:

| Name | Type | Description | Default |
|---|---|---|---|
| text | str | text to copy to clipboard | required |
Source code in alexwu/alexwu.py, lines 183-194
def copy(text: str) -> None:
    """Copy text to clipboard.

    Args:
        text (str): text to copy to clipboard
    """
    # Source: https://stackoverflow.com/questions/11063458/python-script-to-copy-text-to-clipboard
    try:
        import pyperclip  # type: ignore
        pyperclip.copy(text)
    except ModuleNotFoundError:
        sys.stderr.write("Cannot copy. Try `pip install pyperclip`\n")

date2name(pd_series)

Convert dates to day-of-week names.

Parameters:

| Name | Type | Description | Default |
|---|---|---|---|
| pd_series | pd.Series | Input datetimes | required |

Returns:

| Type | Description |
|---|---|
| pd.Series | Day of week names |

Source code in alexwu/alexwu.py, lines 220-231
def date2name(pd_series: pd.Series) -> pd.Series:
    """Convert to date to day of week.

    Args:
        pd_series (pd.Series): Input datetimes

    Returns:
        pd.Series: Day of week names
    """
    DAY_NAMES = ['Monday', 'Tuesday', 'Wednesday', 'Thursday', 'Friday', 'Saturday', 'Sunday']
    # Wrap in a Series so the return value matches the annotated pd.Series type
    day_name_series = pd.Series(pd.Categorical(pd_series.dt.day_name(), categories=DAY_NAMES),
                                index=pd_series.index)
    return day_name_series
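
For example:

>>> s = pd.Series(pd.to_datetime(['2023-08-07', '2023-08-08']))
>>> list(date2name(s))
['Monday', 'Tuesday']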

df_overlaps(df1, df2, suffixes=('1', '2'))

Merge based on overlapping 'start', 'end' variables

Parameters:

| Name | Type | Description | Default |
|---|---|---|---|
| df1 | pd.DataFrame | Input dataframe 1 | required |
| df2 | pd.DataFrame | Input dataframe 2 | required |
| suffixes | tuple | identify corresponding columns with this suffix. Defaults to ('1', '2'). | ('1', '2') |

Returns:

| Type | Description |
|---|---|
| pd.DataFrame | Merged dataframe (based on corresponding 'start' and 'end' of input dataframes) |

Source code in alexwu/alexwu.py, lines 319-348
def df_overlaps(df1: pd.DataFrame, df2: pd.DataFrame, suffixes=('1', '2')) -> pd.DataFrame:
    """Merge based on overlapping 'start', 'end' variables

    Args:
        df1 (pd.DataFrame): Input dataframe 1
        df2 (pd.DataFrame): Input dataframe 2
        suffixes (tuple, optional): identify corresponding columns with this suffix. Defaults to ('1', '2').

    Returns:
        pd.DataFrame: Merged dataframe (based on corresponding 'start' and 'end' of input dataframes)
    """
    assert 'start' in df1.columns and 'end' in df1.columns and 'i' not in df1.columns
    assert 'start' in df2.columns and 'end' in df2.columns and 'i' not in df2.columns
    assert df1['start'].is_monotonic_increasing & df1['end'].is_monotonic_increasing
    assert df2['start'].is_monotonic_increasing & df2['end'].is_monotonic_increasing
    assert all(df1['start'] <= df1['end'])
    assert all(df2['start'] <= df2['end'])
    df1 = df1.reset_index(names='i')
    df2 = df2.reset_index(names='i')
    X_interval = list(df2[['start', 'end']].itertuples(index=False, name=None))
    Y_interval = list(df1[['start', 'end']].itertuples(index=False, name=None))
    overlaps_list = overlaps(X_interval, Y_interval)
    index_list = [[y_i for y_i, _ in x_list] for x_list in overlaps_list]
    overlap_list = [[overlap for _, overlap in x_list] for x_list in overlaps_list]
    i1, _ = f'i{suffixes[0]}', f'i{suffixes[1]}'
    overlap_df = (df2.pipe(add_suffix, suffixes[1])
                  .assign(**{i1: index_list, 'overlap': overlap_list})
                  .explode([i1, 'overlap']))
    overlap_df = overlap_df.merge(df1.pipe(add_suffix, suffixes[0]), on=i1, how='outer')
    return overlap_df

dirr(arg, like=None)

Displays dir(arg) but with more details and formatted as DataFrame.

Parameters:

| Name | Type | Description | Default |
|---|---|---|---|
| arg | Any | python object | required |
| like | str | filter string. Defaults to None. | None |
Source code in alexwu/alexwu.py, lines 37-62
def dirr(arg: Any, like: str=None) -> pd.DataFrame:
    """Displays dir(arg) but with more details and formatted as DataFrame.

    Args:
        arg (Any): python object
        like (str, optional): filter string. Defaults to None.
    """
    def get_attr(arg: Any, x: str) -> str:
        with warnings.catch_warnings():
            warnings.simplefilter("error")
            try:
                return getattr(arg, x)
            except AttributeError:
                return '!'
    print(type(arg))
    dirr_list = [x for x in dir(arg) if not x.startswith('_')]
    dirr_df = pd.DataFrame({'attr': dirr_list})
    dirr_df['type'] = [type(get_attr(arg, x)) for x in dirr_list]
    if like is not None:
        dirr_df = dirr_df[dirr_df['attr'].str.contains(like)]
    dirr_df['doc'] = [get_attr(arg, attr).__doc__ if str(tt) == "<class 'method'>" else ''
                      for attr, tt in zip(dirr_df['attr'], dirr_df['type'])]
    dirr_df['doc'] = dirr_df['doc'].astype(str).str.split(r'\.\n').str[0].str.strip()
    dirr_df['doc'] = [get_attr(arg, attr) if str(tt) != "<class 'method'>" else doc
                      for attr, tt, doc in zip(dirr_df['attr'], dirr_df['type'], dirr_df['doc'])]
    return dirr_df
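
A sketch of typical usage (the exact rows depend on the object inspected):

>>> dirr(pd.Series, like='to_')
<class 'type'>

followed by a DataFrame listing the matching attributes (to_csv, to_dict, ...) with their types.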

disp(df, caption='', k=2, na_rep='-')

(For Jupyter) Prints newlines instead of '\n' characters for easier reading. Optionally, you can label dataframes with caption and round numbers

Parameters:

| Name | Type | Description | Default |
|---|---|---|---|
| df | pd.DataFrame | Input DataFrame | required |
| caption | str | Caption for df. Defaults to ''. | '' |
| k | int | Round to k digits. Defaults to 2. | 2 |
| na_rep | str | Str representation of NA values. Defaults to '-'. | '-' |

Returns:

| Type | Description |
|---|---|
| pd.DataFrame | Styled DataFrame with optional caption |

Source code in alexwu/alexwu.py, lines 529-552
def disp(df: pd.DataFrame, caption='', k=2, na_rep='-') -> pd.DataFrame:
    """(For Jupyter) Prints newlines instead of '\\\\n' characters for easier reading.
    Optionally, you can label dataframes with caption and round numbers

    Args:
        df (pd.DataFrame): Input DataFrame
        caption (str, optional): Caption for df. Defaults to ''.
        k (int, optional): Round to k digits. Defaults to 2.
        na_rep (str, optional): Str representation of NA values. Defaults to '-'.

    Returns:
        pd.DataFrame: Styled DataFrame with optional caption
    """
    assert isnotebook()
    # Ensure row names and column names are unique
    df = df.pipe(df_enumerate)
    df = df.style if hasattr(df, 'style') else df
    df_captioned = (df.format(lambda x: str_round(x, k=k),
                              na_rep=na_rep,
                              subset=df.data.select_dtypes(exclude=object).columns)
                      .set_properties(**{'white-space': 'pre-wrap', 'text-align': 'left'})
                      .set_table_attributes("style='display:inline'")
                      .set_caption(caption))
    return df_captioned

display100(df, ii=10, N=100, na_rep=None)

Display N elements, with ii elements per column.

Parameters:

| Name | Type | Description | Default |
|---|---|---|---|
| df | pd.Series \| pd.DataFrame | Input values | required |
| ii | int | # elements per column. Defaults to 10. | 10 |
| N | int | # elements to display. Defaults to 100. | 100 |
| na_rep | str | Missing str representation. Defaults to None. | None |
Source code in alexwu/alexwu.py, lines 597-609
def display100(df: pd.Series | pd.DataFrame, ii=10, N=100, na_rep=None) -> None:
    """Display N elements with ii elements each column

    Args:
        df (pd.Series, pd.DataFrame): Input values
        ii (int, optional): # elements in column. Defaults to 10.
        N (int, optional): # elements to display. Defaults to 100.
        na_rep (_type_, optional): Missing str representation. Defaults to None.
    """
    if isinstance(df, pd.Series):
        df = df.to_frame()
    N = min(N, len(df))
    displays(*[df.iloc[a:b] for a, b in pairwise(range(0, N+ii, ii))], na_rep=na_rep)

displays(*args, captions=None, k=2, na_rep='-')

(For Jupyter) Display tables side by side to save vertical space. Prints newlines instead of '\n' characters for easier reading. Optionally, you can label dataframes with captions.

Input:
    args: list of pandas.DataFrame
    captions: list of table captions
Source code in alexwu/alexwu.py, lines 555-581
581
def displays(*args, captions: list[str] = None, k=2, na_rep='-'):
    """
    (For Jupyter)
    Display tables side by side to save vertical space.
    Prints newlines instead of '\n' characters for easier reading.
    Optionally, you can label dataframes with captions

    Input:
        args: list of pandas.DataFrame
        captions: list of table captions
    """
    assert isnotebook()
    if captions is None:
        captions = []
    if isinstance(captions, str):
        captions = [captions]
    if k is None:
        k = []

    args = (*args, pd.DataFrame())
    args = [arg.to_frame().style.hide_index() if isinstance(arg, pd.Series) else arg for arg in args]
    k_list = [k]*len(args) if isinstance(k, int) else k
    k_list.extend([None] * (len(args) - len(k_list)))
    captions.extend([''] * (len(args) - len(captions)))
    captioned_tables = [df.pipe(disp, caption, k, na_rep)._repr_html_()
                        for caption, df, k in zip(captions, args, k_list)]
    display(HTML('\xa0\xa0\xa0'.join(captioned_tables)))
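
Typical usage (a sketch; df_before and df_after stand in for any two DataFrames):

>>> displays(df_before, df_after, captions=['before', 'after'])

renders both tables next to each other, each with its caption.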

get_sessions(pd_series, diff=pd.Timedelta(30, 'min'))

Group elements into "sessions".

Compute groups (sessions) chained together by diff units. Assumes pd_series is sorted.

Parameters:

| Name | Type | Description | Default |
|---|---|---|---|
| pd_series | pd.Series | Input values | required |
| diff | pd.Timedelta | Maximum difference between first element and last element. Defaults to pd.Timedelta(30, 'min'). | pd.Timedelta(30, 'min') |

Returns:

| Type | Description |
|---|---|
| pd.Series | Grouped elements |

Source code in alexwu/alexwu.py, lines 196-218
def get_sessions(pd_series: pd.Series, diff=pd.Timedelta(30, 'min')) -> pd.Series:
    """Group elements into "sessions".

    Compute groups (sessions) chained together by `diff` units. Assumes pd_series is sorted.

    Args:
        pd_series (pd.Series): Input values
        diff (_type_, optional): Maximum difference between first element and last element.
            Defaults to pd.Timedelta(30, 'min').

    Returns:
        pd.Series: Grouped elements
    """
    assert pd_series.is_monotonic_increasing

    current_session = pd_series.iloc[0]
    sessions = [current_session]

    for item in pd_series.iloc[1:]:
        if sessions[-1] + diff <= item:
            current_session = item
        sessions.append(current_session)
    return pd.Series(sessions)
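
For example, with the default 30-minute session window:

>>> t = pd.Series(pd.to_datetime(['2023-01-01 09:00', '2023-01-01 09:10',
...                               '2023-01-01 10:00', '2023-01-01 10:05']))
>>> get_sessions(t).nunique()
2

The first two timestamps share the 09:00 session; the last two fall into a new session starting at 10:00.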

grouper(iterable, n, fillvalue=None)

Collect data into fixed-length chunks or blocks.

Parameters:

| Name | Type | Description | Default |
|---|---|---|---|
| iterable | Iterable | Iterable to group into chunks | required |
| n | int | size of chunks | required |
| fillvalue | Any | Missing elements to fill in. Defaults to None. | None |

Returns:

| Name | Type | Description |
|---|---|---|
| Iterable | Iterable | fixed-length chunks or blocks |

Source code in alexwu/alexwu.py, lines 1359-1372
def grouper(iterable: Iterable, n: int, fillvalue: Any=None) -> Iterable:
    """Collect data into fixed-length chunks or blocks.

    Args:
        iterable (Iterable): Iterable to group into chunks
        n (int): size of chunks
        fillvalue (Any, optional): Missing elements to fill in. Defaults to None.

    Returns:
        Iterable: fixed-length chunks or blocks
    """
    # grouper('ABCDEFG', 3, 'x') --> ABC DEF Gxx
    args = [iter(iterable)] * n
    return itertools.zip_longest(*args, fillvalue=fillvalue)
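
For example:

>>> [''.join(chunk) for chunk in grouper('ABCDEFG', 3, 'x')]
['ABC', 'DEF', 'Gxx']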

highlight(df, v, color='DarkSlateGray', subset=None)

Highlight cells equal to a given value.

Parameters:

| Name | Type | Description | Default |
|---|---|---|---|
| df | pd.DataFrame | Input dataframe (or Styler) | required |
| v | Any | Value whose matching cells are highlighted | required |
| color | str | Background color; 'w' and 'k' map to white and black. Defaults to 'DarkSlateGray'. | 'DarkSlateGray' |
| subset | str \| list[str] | Subset of columns to affect. Defaults to None. | None |

Returns:

| Type | Description |
|---|---|
| Styler | Styler with matching cells given a background color |

Example usage:

df.pipe(highlight, 1, color='k')

Source code in alexwu/alexwu.py, lines 877-896
def highlight(df: pd.DataFrame, v: Any, color='DarkSlateGray', subset=None):
    """_summary_

    Args:
        df (pd.DataFrame): _description_
        v (Any): _description_
        color (str, optional): _description_. Defaults to 'DarkSlateGray'.
        subset (_type_, optional): _description_. Defaults to None.

    Returns:
        _type_: _description_

        Example usage:

        df.pipe(highlight, 1, color='k')
    """
    if hasattr(df, 'style'):
        df = df.style
    color = {'w': 'white', 'k': 'black'}.get(color, color)
    return df.applymap(lambda x: f'background-color: {color}' if x == v else '', subset=subset)

isnotebook()

Detect if code is running in Jupyter Notebook

Returns:

| Name | Type | Description |
|---|---|---|
| bool | bool | True = code is running in Jupyter Notebook |

Source code in alexwu/alexwu.py, lines 512-527
def isnotebook() -> bool:
    """Detect if code is running in Jupyter Notebook

    Returns:
        bool: True = code is running in Jupyter Notebook
    """
    try:
        shell = get_ipython().__class__.__name__
        if shell == 'ZMQInteractiveShell':
            return True   # Jupyter notebook or qtconsole
        elif shell == 'TerminalInteractiveShell':
            return False  # Terminal running IPython
        else:
            return False  # Other type (?)
    except NameError:
        return False      # Probably standard Python interpreter

ls(path='.', resolve=False)

View contents of ls command as DataFrame.

Parameters:

| Name | Type | Description | Default |
|---|---|---|---|
| path | Path \| str | Path name to ls. Defaults to '.'. | '.' |
| resolve | bool | Resolve to absolute path. Defaults to False. | False |

Raises:

| Type | Description |
|---|---|
| ValueError | Invalid path name |

Returns:

| Type | Description |
|---|---|
| pd.DataFrame | contents of ls |

Source code in alexwu/alexwu.py, lines 64-100
def ls(path: Path | str = '.', resolve=False) -> pd.DataFrame:
    """View contents of `ls` command as DataFrame.

    Args:
        path (Path | str, optional): Path name to `ls`. Defaults to '.'.
        resolve (bool, optional): Resolve to absolute path. Defaults to False.

    Raises:
        ValueError: Invalid path name

    Returns:
        pd.DataFrame: contents of `ls`
    """
    match path:
        case Path():
            pass
        case '~':
            path = Path.home()
        case str():
            path = Path(path)
        case _:
            raise ValueError('invalid path')
    if resolve:
        path = path.resolve()
    df = DF({path: path.iterdir()})
    df.index += 1
    def g(self, row=1):
        return self.loc[row].iloc[0]
    def open(self, row=None):
        import subprocess
        from pathlib import PureWindowsPath
        posix_path = self.loc[row].iloc[0].resolve() if row is not None else PureWindowsPath(path.resolve())
        windows_path = PureWindowsPath(posix_path)
        subprocess.run(['explorer.exe', windows_path])
    df.g = g.__get__(df)
    df.open = open.__get__(df)
    return df

mkdir(path, **kwargs)

Make directory.

Parameters:

| Name | Type | Description | Default |
|---|---|---|---|
| path | Path \| str | Directory to create | required |

Raises:

| Type | Description |
|---|---|
| ValueError | Invalid path name |

Source code in alexwu/alexwu.py, lines 102-122
def mkdir(path: Path | str, **kwargs) -> None:
    """Make directory.

    Args:
        path (Path | str): Directory to create

    Raises:
        ValueError: Invalid path name
    """
    match path:
        case Path():
            pass
        case str():
            path = Path(path)
        case _:
            raise ValueError('invalid path')
    if 'parents' not in kwargs:
        kwargs['parents'] = True
    if 'exist_ok' not in kwargs:
        kwargs['exist_ok'] = True
    path.mkdir(**kwargs)

overlaps(x_interval, y_interval)

Compute overlaps. Assumes non_overlapping_monotonic_increasing.

There are 9 ways Y_interval can overlap with X_interval

|   XXX   | X_interval   | ( 3,   5 ) |                                       |
|---------|--------------|------------|---------------------------------------|
| _YY_    | left         | (_2 , _4 ) | (x_begin < y_begin) & (x_end < y_end) |
| _YYYY   | left_spill   | (_2 ,  5 ) | (x_begin < y_begin) & (x_end = y_end) |
| _YYYYY_ | superset     | (_2 ,  6_) | (x_begin < y_begin) & (x_end > y_end) |
|   Y_    | left_subset  | ( 3 , _4 ) | (x_begin = y_begin) & (x_end < y_end) |
|   YYY   | equal        | ( 3 ,  5 ) | (x_begin = y_begin) & (x_end = y_end) |
|   YYYY_ | right_spill  | ( 3 ,  6_) | (x_begin = y_begin) & (x_end > y_end) |
|    _    | subset       | (_4_, _4_) | (x_begin > y_begin) & (x_end < y_end) |
|    _Y   | right_subset | (_4 ,  5 ) | (x_begin > y_begin) & (x_end = y_end) |
|    _YY_ | right        | (_4 ,  6_) | (x_begin > y_begin) & (x_end > y_end) |
|---------|--------------|------------|---------------------------------------|
| __      | no_overlap   | (_2 ,  6_) | (x_begin > y_end)                     |
|      __ | no_overlap   | (_2 ,  6_) | (x_end   < y_begin)                   |
|---------|--------------|------------|---------------------------------------|
| 1234567 |              |            |                                       |
Source code in alexwu/alexwu.py, lines 277-316
def overlaps(x_interval, y_interval) -> list:
    # TODO: Optimize O(XY) to O(X + Y) algo
    '''Compute overlaps. Assumes non_overlapping_monotonic_increasing.

    There are 9 ways Y_interval can overlap with X_interval

        |   XXX   | X_interval   | ( 3,   5 ) |                                       |
        |---------|--------------|------------|---------------------------------------|
        | _YY_    | left         | (_2 , _4 ) | (x_begin < y_begin) & (x_end < y_end) |
        | _YYYY   | left_spill   | (_2 ,  5 ) | (x_begin < y_begin) & (x_end = y_end) |
        | _YYYYY_ | superset     | (_2 ,  6_) | (x_begin < y_begin) & (x_end > y_end) |
        |   Y_    | left_subset  | ( 3 , _4 ) | (x_begin = y_begin) & (x_end < y_end) |
        |   YYY   | equal        | ( 3 ,  5 ) | (x_begin = y_begin) & (x_end = y_end) |
        |   YYYY_ | right_spill  | ( 3 ,  6_) | (x_begin = y_begin) & (x_end > y_end) |
        |    _    | subset       | (_4_, _4_) | (x_begin > y_begin) & (x_end < y_end) |
        |    _Y   | right_subset | (_4 ,  5 ) | (x_begin > y_begin) & (x_end = y_end) |
        |    _YY_ | right        | (_4 ,  6_) | (x_begin > y_begin) & (x_end > y_end) |
        |---------|--------------|------------|---------------------------------------|
        | __      | no_overlap   | (_2 ,  6_) | (x_begin > y_end)                     |
        |      __ | no_overlap   | (_2 ,  6_) | (x_end   < y_begin)                   |
        |---------|--------------|------------|---------------------------------------|
        | 1234567 |              |            |                                       |
    '''
    overlaps_list = []
    for _, (x_begin, x_end) in enumerate(x_interval):
        x_overlap_list = []
        for y_i, (y_begin, y_end) in enumerate(y_interval):
            # Case: no_overlap
            #if x_begin > y_end or x_end < y_begin:
            if x_begin >= y_end or x_end <= y_begin:
                continue

            begin_order = '>' if x_begin < y_begin else '<' if x_begin > y_begin else '='
            end_order = '>' if x_end < y_end else '<' if x_end > y_end else '='
            overlap_str = f'{begin_order}{end_order}'
            overlap_tuple = (y_i, overlap_str)
            x_overlap_list.append(overlap_tuple)

        overlaps_list.append(x_overlap_list)
    return overlaps_list
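
A minimal sketch of the output shape: one list per x interval, each entry pairing a y index with its overlap code:

>>> overlaps([(3, 5)], [(2, 4), (4, 6)])
[[(0, '<<'), (1, '>>')]]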

pairwise(iterable)

Groups elements pairwise: s -> (s0,s1), (s1,s2), (s2, s3), ...

Parameters:

| Name | Type | Description | Default |
|---|---|---|---|
| iterable | Iterable | Input iterable | required |

Returns:

| Name | Type | Description |
|---|---|---|
| Iterable | Iterable | zipped iterable |

Source code in alexwu/alexwu.py, lines 584-595
def pairwise(iterable: Iterable) -> Iterable:
    """Groups elements pairwise: s -> (s0,s1), (s1,s2), (s2, s3), ...

    Args:
        iterable (Iterable): Input iterable

    Returns:
        Iterable: zipped iterable
    """
    a, b = itertools.tee(iterable)
    next(b, None)
    return zip(a, b)
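
For example:

>>> list(pairwise([1, 2, 3, 4]))
[(1, 2), (2, 3), (3, 4)]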

percent(pd_series, caption='', display_false=False)

Display percentage

Parameters:

| Name | Type | Description | Default |
|---|---|---|---|
| pd_series | pd.Series | Input values | required |
| caption | str | Caption. Defaults to ''. | '' |
| display_false | bool | Display percentage of False values. Defaults to False. | False |

Returns:

| Type | Description |
|---|---|
| pd.DataFrame | Displayed percentage |

Source code in alexwu/alexwu.py, lines 643-669
def percent(pd_series: pd.Series, caption='', display_false=False) -> pd.DataFrame:
    """Display percentage

    Args:
        pd_series (pd.Series): Input values
        caption (str, optional): Caption. Defaults to ''.
        display_false (bool, optional): Display percentage of False values. Defaults to False.

    Returns:
        pd.DataFrame: Displayed percentage
    """
    df = pd.value_counts(pd_series).to_frame().T
    if True not in df:
        df[True] = 0
    if False not in df:
        df[False] = 0
    df['Total'] = len(pd_series)
    df['%'] = 100*df[True] / df['Total']
    if not display_false:
        df = df.rename(columns={True: 'N'})
        df = df.drop(columns=[False])
    styled_df = (df.style.hide()
            .bar(vmin=0, vmax=100, color='#543b66', subset=['%'])
            .format('{:,.1f}', subset=['%']))
    if caption:
        styled_df = styled_df.set_caption(caption)
    return styled_df
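
A sketch of typical usage on a boolean mask:

>>> percent(pd.Series([True, True, False]), caption='passed')

renders a one-row table with N=2, Total=3 and a bar at 66.7%.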

read_file(filename, overwrite=False, base='data', verbose=True, **kwargs)

Read serialized file, caching the result

Parameters:

| Name | Type | Description | Default |
|---|---|---|---|
| filename | Path \| str | File to read in | required |
| overwrite | bool | Overwrite cache. Defaults to False. | False |
| base | str | Base filepath. Defaults to 'data'. | 'data' |
| verbose | bool | Print what's going on. Defaults to True. | True |

Returns:

| Type | Description |
|---|---|
| pd.DataFrame | Input file as DataFrame |

Source code in alexwu/alexwu.py, lines 410-424
def read_file(filename: Path | str, overwrite=False, base='data', verbose=True, **kwargs) -> pd.DataFrame:
    """Read serialized file, caching the result

    Args:
        filename (Path | str): File to read in
        overwrite (bool, optional): Overwrite cache. Defaults to False.
        base (str, optional): Base filepath. Defaults to 'data'.
        verbose (bool, optional): Print what's going on. Defaults to True.

    Returns:
        pd.DataFrame: Input file as DataFrame
    """
    if overwrite:
        return _read_file.__wrapped__(filename, base=base, verbose=verbose, **kwargs)
    return _read_file(filename, base=base, verbose=verbose, **kwargs)

reload(copy_clipboard=False)

Prints Jupyter's autoreload magic function.

Parameters:

| Name | Type | Description | Default |
|---|---|---|---|
| copy_clipboard | bool | Copy contents to clipboard (does not work via remote-ssh). Defaults to False. | False |
Source code in alexwu/alexwu.py, lines 27-35
def reload(copy_clipboard=False) -> None:
    """Prints Jupyter's autoreload magic function.

    Args:
        copy_clipboard (bool, optional): Copy contents to clipboard (does not work via remote-ssh). Defaults to False.
    """
    if copy_clipboard:
        copy('%load_ext autoreload\n%autoreload 2')
    print('%load_ext autoreload\n%autoreload 2')

report(func, *df_args, compare=False, concat=False, k_list=2, fillna=False, **kwargs)

Report descriptive statistics.

Parameters:

| Name | Type | Description | Default |
|---|---|---|---|
| func | Callable | Function to map | required |
| compare | bool | Compare pairwise dataframes. Defaults to False. | False |
| concat | bool | Concatenate descriptive columns. Defaults to False. | False |
| k_list | int | Round to k digits. Defaults to 2. | 2 |
| fillna | bool | NA str representation. Defaults to False. | False |

Returns:

| Type | Description |
|---|---|
| pd.DataFrame | Styled DataFrame(s) |

Source code in alexwu/alexwu.py, lines 1374-1427
@ignore_warnings
def report(func: Callable | list | pd.DataFrame, *df_args,
           compare=False, concat=False, k_list=2, fillna=False, **kwargs) -> pd.DataFrame:
    """Report descriptive statistics.

    Args:
        func (Callable): Function to map
        compare (bool, optional): Compare pairwise dataframes. Defaults to False.
        concat (bool, optional): Concatenate descriptive columns. Defaults to False.
        k_list (int, optional): Round to k digits. Defaults to 2.
        fillna (bool, optional): NA str representation. Defaults to False.

    Returns:
        pd.DataFrame: Styled DataFrame(s)
    """
    if isinstance(func, list):
        assert len(df_args) > 0
        cols = func
        def func(df):
            return report_rows_df(df, cols=cols, fillna=fillna, **kwargs)
    if not callable(func):
        df_args = [func, *df_args]
        cols = None
        def func(df):
            return report_rows_df(df, cols=cols, fillna=fillna, **kwargs)

    if compare and len(df_args) % 2 == 1:
        df_args = (*df_args[:-1], df_args[0], df_args[-1])
    if compare:
        if cols is None:
            cols = list(df_args[0].columns)
        compared_dfs = [(func(df1), func(df2), compare_dfs(df2, df1, cols, fillna=fillna))
                        for df1, df2 in grouper(df_args, 2)]
        result_dfs = list(itertools.chain(*compared_dfs))
    else:
        result_dfs = [func(df) for df in df_args]

    if k_list is None:
        k_list = [None] * len(result_dfs)
    if isinstance(k_list, int):
        k_list = [k_list] * len(result_dfs)
    k_list = [None, *k_list]

    if concat:
        result_dfs = [df.data if hasattr(df, 'data') else df for df in result_dfs]
        # result_dfs = [df.applymap(lambda x: str_round(x, k=k)) for df, k in zip(result_dfs, k_list[1:])]
        ## Ensure only one new-line per empty row
        result_dfs = [result_dfs[0]] + [df.replace('\n', '') for df in result_dfs[1:]]
        result_dfs = pd.concat(result_dfs, axis=1).pipe(df_enumerate)
        result_dfs.pipe(df_index, verbose=True, k=k_list)
    else:
        index_df = result_dfs[0].pipe(df_index)
        displays(index_df, *[df.style.hide() if hasattr(df, 'style') else df.hide()
                             for df in result_dfs], k=k_list)

report_categorical(pd_series, dropna=True, style=True)

Report descriptive stats of categorical variable.

Parameters:

| Name | Type | Description | Default |
|---|---|---|---|
| pd_series | pd.Series | Input values | required |
| dropna | bool | Drop missing elements. Defaults to True. | True |
| style | bool | Style output. Defaults to True. | True |

Returns:

| Name | Type | Description |
|---|---|---|
| dict | dict | Descriptive stats |

Source code in alexwu/alexwu.py, lines 1220-1243
def report_categorical(pd_series: pd.Series, dropna=True, style=True) -> dict:
    """Report descriptive stats of categorical variable.

    Args:
        pd_series (pd.Series): Input values
        dropna (bool, optional): Drop missing elements. Defaults to True.
        style (bool, optional): Style output. Defaults to True.

    Returns:
        dict: Descriptive stats
    """
    if pd_series.dtype.name == 'bool':
        pd_series = pd.Categorical(pd_series, categories=[True, False]).rename_categories({True: 'Yes', False: 'No'})
    vcounts = pd.value_counts(pd_series, sort=False, dropna=dropna)
    vcount_dict = dict(zip(vcounts.index.to_list(), [[x, '', ''] for x in vcounts]))
    if not dropna:
        vcount_dict[np.nan] = vcount_dict.get(np.nan, [0, '', ''])
        vcount_dict['(N/A)'] = vcount_dict.pop(np.nan)

    if style:
        vsum = vcounts.sum()
        vcount_dict = {k: [v[0], f'({100*v[0]/vsum:.1f}%)', f'{100*v[0]/vsum:.1f}%']
                    for k, v in vcount_dict.items()}
    return vcount_dict

report_numerical(pd_series, name='', k=2, proportiontocut=0, fillna=False, N=True)

Report descriptive stats of a numerical variable.

Parameters:

| Name | Type | Description | Default |
|---|---|---|---|
| pd_series | pd.Series | Input values | required |
| name | str | Name of variable. Defaults to ''. | '' |
| k | int | Round to k digits. Defaults to 2. | 2 |
| proportiontocut | int | Trimmed proportion (from 0 to 1). Defaults to 0. | 0 |
| fillna | bool | Filled value. Defaults to False. | False |
| N | bool | Proportion elements complete (i.e. not missing). Defaults to True. | True |

Returns:

| Name | Type | Description |
|---|---|---|
| dict | dict | Descriptive statistics rows |

Source code in alexwu/alexwu.py, lines 1245-1288
def report_numerical(pd_series: pd.Series, name='', k=2, proportiontocut=0, fillna=False, N=True) -> dict:
    """Report descriptive statis of numerical variable.

    Args:
        pd_series (pd.Series): Input values
        name (str, optional): Name of variable. Defaults to ''.
        k (int, optional): round to k digits. Defaults to 2.
        proportiontocut (int, optional): Trimmed propertion (from 0 to 1). Defaults to 0.
        fillna (bool, optional): Filled value. Defaults to False.
        N (bool, optional): Proportion elements complete (i.e. not missing). Defaults to True.

    Returns:
        dict: _description_
    """
    if proportiontocut > 0:
        pd_series = trim(pd_series, proportiontocut)
    report_numerical_dict = {}
    if N:
        _row0 = [pd_series.notna().sum(),
                 f'({100*pd_series.notna().mean():.1f}%)',
                 f'{100*pd_series.notna().mean():.1f}%']
        report_numerical_dict[f'{name} (N)'] = _row0
    if fillna:
        pd_series = pd_series.fillna(0)
    if isinstance(pd_series, pd.Series) and pd_series.dtype == 'datetime64[ns]':
        _row1 = [pd_series.mean().date(),
                 f'{pd_series.std().days} days',
                 '-']
        _row2 = [pd_series.median().date(),
                 q1q3(pd_series, is_date=True),
                 bracket_str(pd_series.min(), pd_series.max(), is_date=True)]
        report_numerical_dict[f'{name} (mean, SD)'] = _row1
        report_numerical_dict[f'{name} (median, [Q1, Q3], [min, max])'] = _row2
    else:
        mean, ci_left, ci_right = mean_confidence_interval(pd_series)
        _row1 = [str_round(mean, k),
                 str_round(pd_series.std(), k),
                 bracket_str(ci_left, ci_right, k=k)]
        _row2 = [str_round(pd_series.median(), k),
                 q1q3(pd_series, k=k),
                 bracket_str(pd_series.min(), pd_series.max(), k=k)]
        report_numerical_dict[f'{name} (mean, SD, 95% CI)'] = _row1
        report_numerical_dict[f'{name} (median, [Q1, Q3], [min, max])'] = _row2
    return report_numerical_dict

report_rows(df, cols=None, dropna=False, k=2, proportiontocut=0, fillna=False, style=True)

Report descriptive statistics as a dict.

Parameters:

| Name | Type | Description | Default |
|---|---|---|---|
| df | pd.DataFrame | Input DataFrame | required |
| cols | str \| list[str] | column names. Defaults to None. | None |
| dropna | bool | Whether to drop missing rows. Defaults to False. | False |
| k | int | Round to k digits. Defaults to 2. | 2 |
| proportiontocut | int | Trimmed proportion (from 0 to 1). Defaults to 0. | 0 |
| fillna | bool | Filled value. Defaults to False. | False |
| style | bool | Style dataframe. Defaults to True. | True |

Returns:

| Name | Type | Description |
|---|---|---|
| dict | dict | descriptive statistics |

Source code in alexwu/alexwu.py, lines 1290-1326
def report_rows(df: pd.DataFrame, cols: str | list[str]=None,
                dropna=False, k=2, proportiontocut=0, fillna=False, style=True) -> dict:
    """Report descriptive statistics as a dict.

    Args:
        df (pd.DataFrame): Input DataFrame
        cols (str | list[str], optional): column names. Defaults to None.
        dropna (bool, optional): Whether to drop missing rows. Defaults to False.
        k (int, optional): Round to k digits. Defaults to 2.
        proportiontocut (int, optional): Trimmed proportion (from 0 to 1). Defaults to 0.
        fillna (bool, optional): Filled value. Defaults to False.
        style (bool, optional): Style dataframe. Defaults to True.

    Returns:
        dict: descriptive statistics
    """
    rows = {}
    if isinstance(df, list):
        rows_list = [report_rows(d) for d in df]
        return {k: v for x in rows_list for k, v in x.items()}
    if isinstance(df, pd.Series):
        df = df.to_frame(name='')
    if cols is None:
        cols = list(df.columns)
    for col in cols:
        if col.startswith('-'):
            rows[('-', col[1:])] = ['\n', '', '']
        elif df[col].dtype.name == 'object':
            rows[(col, '(N, uniq)')] = [df[col].notna().sum(), df[col].nunique(), '']
        elif df[col].dtype.name in ('bool', 'category'):
            rows[(col, f'{col} N (%)')] = ['\n', '', '']
            rows.update({(col, k): v for k, v in report_categorical(df[col], dropna=dropna, style=style).items()})
        else:
            # rows[(col, '-')] = ['\n', '', '']
            _items = report_numerical(df[col], k=k, proportiontocut=proportiontocut, fillna=fillna).items()
            rows.update({(col, k): v for k, v in _items})
    return rows

report_rows_df(df, cols=None, dropna=False, k=2, proportiontocut=0, fillna=False, style=True)

Report descriptive statistics.

Parameters:

| Name | Type | Description | Default |
|---|---|---|---|
| df | pd.DataFrame | Input DataFrame | required |
| cols | str \| list[str] | column names. Defaults to None. | None |
| dropna | bool | Whether to drop missing rows. Defaults to False. | False |
| k | int | Round to k digits. Defaults to 2. | 2 |
| proportiontocut | int | Trimmed proportion (from 0 to 1). Defaults to 0. | 0 |
| fillna | bool | Filled value. Defaults to False. | False |
| style | bool | Style dataframe. Defaults to True. | True |

Returns:

| Type | Description |
|---|---|
| pd.DataFrame | Descriptive statistics |

Source code in alexwu/alexwu.py, lines 1328-1356
def report_rows_df(df: pd.DataFrame, cols: str | list[str]=None,
                   dropna=False, k=2, proportiontocut=0, fillna=False, style=True) -> pd.DataFrame:
    """Report descriptive statistics.

    Args:
        df (pd.DataFrame): Input DataFrame
        cols (str | list[str], optional): column names. Defaults to None.
        dropna (bool, optional): Whether to drop missing rows. Defaults to False.
        k (int, optional): Round to k digits. Defaults to 2.
        proportiontocut (int, optional): Trimmed proportion (from 0 to 1). Defaults to 0.
        fillna (bool, optional): Filled value. Defaults to False.
        style (bool, optional): Style dataframe. Defaults to True.

    Returns:
        pd.DataFrame: Descriptive statistics
    """
    INDEX_COLS = ['N', '%/SD/IQR', '95% CI/Range']
    res = pd.DataFrame(
        report_rows(df, cols, dropna=dropna, k=k, proportiontocut=proportiontocut, fillna=fillna, style=style),
        index=INDEX_COLS
    ).T
    if style:
        def bar_percent(x, color='#543b66'):
            if str(x).endswith('%'):
                x = float(x[:-1])
                return (f'background: linear-gradient(90deg, {color} {x}%, transparent {x}%); '
                        'width: 10em; color: rgba(0,0,0,0);')
        return res.style.applymap(bar_percent, color='steelblue', subset=['95% CI/Range'])
    return res

rm_file(filename, base='data', verbose=True)

Remove file

Parameters:

| Name | Type | Description | Default |
|---|---|---|---|
| filename | Path \| str | file to remove | required |
| base | str | Base filepath. Defaults to 'data'. | 'data' |
| verbose | bool | Print what's going on. Defaults to True. | True |

Raises:

| Type | Description |
|---|---|
| ValueError | Invalid filename |

Source code in alexwu/alexwu.py, lines 426-456
def rm_file(filename: Path | str, base='data', verbose=True) -> None:
    """Remove file

    Args:
        filename (Path | str): file to remove
        base (str, optional): Base filepath. Defaults to 'data'.
        verbose (bool, optional): Print what's going on. Defaults to True.

    Raises:
        ValueError: Invalid filename
    """
    match filename:
        case Path():
            base = filename.parent
            filename = filename.name
        case str():
            pass
        case _:
            raise ValueError
    if '.' not in filename:
        filename = f'{filename}.feather'
    P_REMOVE = Path(base) / filename
    if P_REMOVE.exists():
        if verbose:
            size(P_REMOVE.stat().st_size, prefix=f'Deleting "{P_REMOVE}"')
        P_REMOVE.unlink()
    else:
        print(f'"{P_REMOVE}" does not exist...')
    if P_REMOVE.parent.exists() and not any(P_REMOVE.parent.iterdir()):
        print(f'Removing empty directory: "{P_REMOVE.parent}"...')
        P_REMOVE.parent.rmdir()

show(item, hide_docstring=False)

(For Jupyter) Displays function source code or JSON output

Source code in alexwu/alexwu.py, lines 621-641
def show(item, hide_docstring: bool = False):
    """(For Jupyter) Displays function source code or JSON output"""
    if callable(item):
        code = inspect.getsource(item)
        if hide_docstring:
            function_text = [code.split('"""')[0], *code.split('"""')[2:]]
            code = ''.join([x.rstrip() for x in function_text])
        display_code(code)
    elif isnotebook():
        if isinstance(item, dict):
            try:
                import plotly.io as pio  # type: ignore
                pio.show(item, 'json', False)
            except ModuleNotFoundError:
                display_code(item, 'json')
        elif isinstance(item, str):
            display(Markdown(item))
        else:
            return type(item)
    else:
        return type(item)

size(num, prefix='', deep=True, verbose=True)

Human readable file size (ex: 123.4 KB)

Source code in alexwu/alexwu.py, lines 361-377
def size(num: Any, prefix='', deep=True, verbose=True):
    """Human readable file size (ex: 123.4 KB)"""
    x = num
    if not isinstance(x, (int, float)):
        num = len(num)
    if isinstance(x, (str, set, dict, list)):
        return print(f'{num:,}') if verbose else f'{num:,}'
    if isinstance(x, pd.DataFrame):
        x = x.memory_usage(deep=deep).sum()
    if isinstance(x, pd.Series):
        x = x.memory_usage(deep=deep)

    for unit in ('bytes', 'KB', 'MB', 'GB', 'TB'):
        if abs(x) < 1024:
            return print(f'{prefix}: {num:,}  ({x:3.1f}+ {unit})') if verbose else (f'{num:,}  ({x:3.1f}+ {unit})')
        x /= 1024
    print(f'{prefix}: {num:,}  ({x:.1f}+ PB)') if verbose else (f'{num:,}  ({x:.1f}+ PB)')

str_contains(pd_series, *regex_str_list, **kwargs)

Filters Pandas Series strings using regex patterns from regex_str_list

Parameters

pat : str
    Character sequence or regular expression.
case : bool, default True
    If True, case sensitive.
flags : int, default 0 (no flags)
    Flags to pass through to the re module, e.g. re.IGNORECASE.
na : scalar, optional
    Fill value for missing values. The default depends on dtype of the array. For object-dtype, numpy.nan is used. For StringDtype, pandas.NA is used.
regex : bool, default True
    If True, assumes the pat is a regular expression. If False, treats the pat as a literal string.

Source code in alexwu/alexwu.py, lines 898-932
def str_contains(pd_series, *regex_str_list, **kwargs):
    '''
    Filters Pandas Series strings using regex patterns from `regex_str_list`

    Parameters
    ----------
    pat : str
        Character sequence or regular expression.
    case : bool, default True
        If True, case sensitive.
    flags : int, default 0 (no flags)
        Flags to pass through to the re module, e.g. re.IGNORECASE.
    na : scalar, optional
        Fill value for missing values. The default depends on dtype of the
        array. For object-dtype, ``numpy.nan`` is used. For ``StringDtype``,
        ``pandas.NA`` is used.
    regex : bool, default True
        If True, assumes the pat is a regular expression.

        If False, treats the pat as a literal string.
    '''
    if 'case' not in kwargs:
        kwargs['case'] = False

    match pd_series:
        case pd.Series():
            pass
        case str():
            pd_series = pd.Series([pd_series])
        case _:
            raise ValueError

    mask_list = [pd_series.str.contains(x, **kwargs) for x in regex_str_list]
    pd_series_masked = pd_series[reduce(lambda x,y: x|y, mask_list)]
    return pd_series_masked
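
For example, keeping rows that match any of the patterns (case-insensitive by default):

>>> s = pd.Series(['apple', 'banana', 'cherry'])
>>> str_contains(s, 'an', 'err').tolist()
['banana', 'cherry']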

trim(pd_series, proportiontocut=0.05, conservative=True)

Trimmed bottom and top percentiles.

Parameters:

| Name | Type | Description | Default |
|---|---|---|---|
| pd_series | pd.Series | Input values | required |
| proportiontocut | float | Proportion of elements to discard. Defaults to 0.05. | 0.05 |
| conservative | bool | Keep bordered element. Defaults to True. | True |

Raises:

| Type | Description |
|---|---|
| ValueError | Cannot trim more than 50% from bottom and top |

Returns:

| Type | Description |
|---|---|
| pd.Series | Trimmed values |

Source code in alexwu/alexwu.py, lines 1018-1043
def trim(pd_series: pd.Series, proportiontocut=0.05, conservative=True) -> pd.Series:
    """Trimmed bottom and top percentiles.

    Args:
        pd_series (pd.Series): Input values
        proportiontocut (float, optional): Proportion of elements to discard. Defaults to 0.05.
        conservative (bool, optional): Keep bordered element. Defaults to True.

    Raises:
        ValueError: Cannot trim more than 50% from bottom and top

    Returns:
        pd.Series: Trimmed values
    """
    from math import ceil
    if pd_series.size == 0:
        return pd_series

    nobs = pd_series.shape[0]
    lowercut = int(proportiontocut * nobs) if conservative else ceil(proportiontocut * nobs)
    uppercut = nobs - lowercut
    if lowercut > uppercut:
        raise ValueError("Proportion too big.")

    atmp = np.partition(pd_series, (lowercut, uppercut - 1))
    return pd.Series(atmp[lowercut:uppercut])
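
For example, trimming 10% from each end of ten values (sorted here because np.partition does not guarantee order in the middle):

>>> trim(pd.Series(range(10)), proportiontocut=0.1).sort_values().tolist()
[1, 2, 3, 4, 5, 6, 7, 8]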

wrap_series(fn)

Allows Pandas Series operations to apply to other input types

Source code in alexwu/alexwu.py, lines 143-154
def wrap_series(fn: Callable) -> Callable:
    """Allows Pandas series operations to apply for other input"""
    def wrapper(series, *args):
        not_series = False
        if not isinstance(series, pd.Series):
            not_series = True
            series = pd.Series(series)
        res = fn(series, *args)
        if not_series:
            res = res.iloc[0]
        return res
    return wrapper
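
A sketch of lifting a Series string method to plain Python values:

>>> title = wrap_series(lambda s: s.str.title())
>>> title('hello world')
'Hello World'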

write_file(df, filename, overwrite=False, base='data', verbose=True, **kwargs)

Write serialized file

Parameters:

| Name | Type | Description | Default |
|---|---|---|---|
| df | pd.DataFrame | DataFrame to save to disk | required |
| filename | Path \| str | Filename | required |
| overwrite | bool | Overwrite file. Defaults to False. | False |
| base | str | Base path. Defaults to 'data'. | 'data' |
| verbose | bool | Verbose. Defaults to True. | True |

Raises:

| Type | Description |
|---|---|
| ValueError | Invalid filename |
| ValueError | Invalid file type |

Source code in alexwu/alexwu.py, lines 463-505
def write_file(df: pd.DataFrame, filename: Path | str, overwrite=False, base='data', verbose=True, **kwargs) -> None:
    """Write serialized file

    Args:
        df (pd.DataFrame): DataFrame to save to disk
        filename (Path | str): Filename
        overwrite (bool, optional): Overwrite file. Defaults to False.
        base (str, optional): Base path. Defaults to 'data'.
        verbose (bool, optional): Verbose. Defaults to True.

    Raises:
        ValueError: Invalid filename
        ValueError: Invalid file type
    """
    match filename:
        case Path():
            base = filename.parent
            filename = filename.name
        case str():
            pass
        case _:
            raise ValueError
    if '.' not in filename:
        df = DF(df)
        filename = f'{filename}.feather'
    P_WRITE = Path(base) / filename
    if overwrite or not P_WRITE.exists():
        P_WRITE.parent.mkdir(parents=True, exist_ok=True)
        if filename.endswith('.feather'):
            df.to_feather(P_WRITE, **kwargs)
        elif filename.endswith('.parquet'):
            df.to_parquet(P_WRITE, **kwargs)
        elif filename.endswith('.parquet.gzip'):
            df.to_parquet(P_WRITE, **{'compression': 'gzip', **kwargs})
        elif filename.endswith('.pkl'):
            df.to_pickle(P_WRITE, **kwargs)
        elif filename.endswith('.csv'):
            df.to_csv(P_WRITE, **{'index': False, **kwargs})
        else:
            raise ValueError
        df.pipe(size, prefix='(DataFrame rows)')
    if verbose:
        size(P_WRITE.stat().st_size, prefix=P_WRITE)
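
A sketch of the round trip with read_file, assuming the default 'data' base directory:

>>> write_file(df, 'users')    # writes data/users.feather if it does not already exist
>>> df = read_file('users')    # reads it back, caching the result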