Python Packages

Autograd

import autograd.numpy as np
from autograd import grad, jacobian

# loglikelihood is presumably a user-defined scalar function written with
# autograd.numpy operations -- TODO confirm
f = loglikelihood
df = grad(f) # gradient of the log-likelihood (Fisher's score)
d2f = jacobian(df) # Jacobian of the gradient, i.e. the Hessian

https://github.com/HIPS/autograd/blob/master/docs/tutorial.md

Beautiful Soup

Pulling data out of HTML and XML files. https://www.crummy.com/software/BeautifulSoup/bs4/doc/

from bs4 import BeautifulSoup
soup = BeautifulSoup(page.content, 'html.parser')

soup.children
soup.find('p') # returns first instance
# `class` is a reserved word in Python, so bs4 spells the keyword `class_`;
# the tag-name filter is the `name` parameter
soup.find_all(name=None, class_=None, id=None)
elem.get_text()

# searching using CSS selectors
soup.select("div p")

tag['attribute']

Bokeh

from bokeh.plotting import figure, save, show

# data
x = [1, 2, 3, 4, 5]
y1 = [6, 7, 2, 4, 5]
y2 = [2, 3, 4, 5, 6]
y3 = [4, 5, 5, 7, 2]

# create a new plot
p = figure(title="Example", x_axis_label="x", y_axis_label="y")

# add renderers
p.line(x, y1, legend_label="Temp.", color="blue", line_width=2)
p.vbar(x=x, top=y2, legend_label="Rate", width=0.5, bottom=0, color="red")
circle = p.scatter(
    x,
    y3,
    marker="circle",
    size=80,
    legend_label="Objects",
    fill_color="red",
    fill_alpha=0.5,
    line_color="blue",
)

# change object's glyph
glyph = circle.glyph
glyph.fill_color = "blue"

# toolbar
p.toolbar.autohide = True
p.toolbar.logo = None

# view
show(p)
# save needs to be imported alongside figure and show (see the import above)
save(p)

Openpyxl

import openpyxl

wb = openpyxl.load_workbook(file, read_only=True, data_only=True)
ws = wb["sheet_name"]
val = ws.cell(1, 1).value
# with values_only=True each item yielded by iter_rows is a tuple of cell values for one row
[row[i] for row in ws.iter_rows(min_row=..., max_row=..., min_col=..., max_col=..., values_only=True)]
# a read-only workbook keeps the file open; close it explicitly when done
wb.close()

CharDet

import chardet

# guess the encoding from the first 10,000 bytes of the file
with open("path.file", 'rb') as f:
    result = chardet.detect(f.read(10000))
print(result)

FuzzyWuzzy

# the submodules must be imported explicitly; a bare `import fuzzywuzzy`
# does not make fuzzywuzzy.process / fuzzywuzzy.fuzz available
from fuzzywuzzy import fuzz, process

matches = process.extract("south korea", countries, limit=10, scorer=fuzz.token_sort_ratio)

Hypothesis

https://hypothesis.readthedocs.io/en/latest/quickstart.html

from hypothesis import given, strategies as st

@given(st.floats(), st.floats())
def test_assoc(x, y):
    """Property: multiplying two floats gives the same result in either order.

    Hypothesis falsifies this, e.g. with x=0.0 and y=inf, where both
    products are NaN and NaN never compares equal to itself.
    """
    forward = x * y
    backward = y * x
    assert forward == backward

# AssertionError:
# Falsifying example: test_assoc(
#     x=0.0,
#     y=inf,
# )

Matplotlib

Architecture

Scripting
- pyplot provides a quick and dirty way to work with Figure
Artist
- About:
  - knows how to use pen to put ink on paper
  - everything on a Figure (axes, labels, ticks, lines) is an instance of Artist
  - translates artist co-ord system to that of the canvas
- Primitive: Line2D, Rectangle, Circle, Text
- Composite: collections of Artist: Axis, Tick, Axes, Figure
Backend
- FigureCanvas (“paper”)
- Renderer (“pen”). Each new renderer has to implement
  - draw_path: Draws compound polygons, made up of line and Bézier segments.
  - draw_image: Draws raster images.
  - draw_text: Draws text with the given font properties.
- Event (“user interaction”)

class SomeArtist(Artist):
    """Example Artist showing how the draw method is implemented."""

    def draw(self, renderer):
        """Paint this artist onto the canvas through the renderer's methods.

        Nothing is drawn when the artist is not visible.
        """
        if self.get_visible():
            # create some objects and use renderer to draw self here
            renderer.draw_path(graphics_context, path, transform)

# TODO: consider using
# https://matplotlib.org/3.5.1/api/ticker_api.html#matplotlib.ticker.FixedFormatter

millnames = ['', 'K', 'M', 'B']


def millify(n, decimals=0):
    """Format a number with a thousands suffix, e.g. 1500 -> '1.5K' (decimals=1).

    Args:
        n: Value to format; anything accepted by float().
        decimals: Number of digits after the decimal point.

    Returns:
        The scaled number with thousands separators, followed by the
        matching entry of ``millnames``. Values of 10**12 and above stay
        in the last suffix (e.g. '1,000B').
    """
    n = float(n)
    if n == 0 or not math.isfinite(n):
        # log10 is undefined for 0, and floor() raises on inf/nan
        return f'{n:,.{decimals}f}'

    last = len(millnames) - 1
    millidx = max(0, min(last, int(math.floor(math.log10(abs(n)) / 3))))
    text = f'{n / 10 ** (3 * millidx):,.{decimals}f}'

    # Rounding can push the scaled value up to 1000 (999,999 -> '1,000K');
    # step to the next suffix so it reads '1M' instead.
    if millidx < last and abs(float(text.replace(',', ''))) >= 1000:
        millidx += 1
        text = f'{n / 10 ** (3 * millidx):,.{decimals}f}'
    return text + millnames[millidx]

def format_yaxis(ax, decimals=0):
    """Label the major y-axis ticks of *ax* using millify with *decimals* digits."""
    yaxis = ax.get_yaxis()

    def _tick_label(value, _pos):
        # alternative without suffixes: format(int(value), ',')
        return millify(value, decimals)

    yaxis.set_major_formatter(matplotlib.ticker.FuncFormatter(_tick_label))

# set y-axis range
plt.ylim(bottom=0)
ax.set_ylim(bottom=0)

Examples

plt.scatter(x, y, )
ax.hist(data, bins=50, density=True, histtype="step")

Subfigures:

import matplotlib.ticker as mtick

fig = plt.figure(constrained_layout=True)
fig.suptitle('Figure title')

# create 3x1 subfigs
subfigs = fig.subfigures(nrows=3, ncols=1)
for row, subfig in enumerate(subfigs):
    subfig.suptitle(f'Subfigure title {row}')

    # create 1x3 subplots per subfig
    axs = subfig.subplots(nrows=1, ncols=3, sharex='col', sharey='row')
    for col, ax in enumerate(axs):
        # scatter (not plot) accepts per-point colors `c` and marker size `s`
        points = ax.scatter(x, y, c=color_values, s=10)
        ax.set_title(f'Plot title {col}')
        ax.hist(x, density=True, histtype="step")

        ax.set_ylim((0,None))

        # gridlines
        ax.set_axisbelow(True) # ensure gridlines are in background
        ax.grid(visible=True, which='major', axis='both')

        # horizontal line at y=0.0; use ax.axvline for a vertical line
        ax.axhline(0.0, linestyle='--', color='gray')

        ax.yaxis.set_major_formatter(mtick.PercentFormatter(1.0)) # 1=100%

        # colorbar is a (sub)figure method, not an Axes method; it needs the
        # mappable returned by scatter
        subfig.colorbar(points, ax=ax)

fig.savefig(fig_path.joinpath(f"{name}.pdf"), bbox_inches='tight')

Colors

old: b, g, r, …
new: C0, C1, C2, …

3d charts

import numpy as np
import matplotlib.pyplot as plt
from mpl_toolkits import mplot3d

%matplotlib qt # to make it interactive in notebooks

fig = plt.figure()
ax = plt.axes(projection='3d')

ax.plot3D(xline, yline, zline, 'gray')
ax.scatter3D(xdata, ydata, zdata, c=zdata, cmap='Greens')
ax.contour3D(X, Y, Z, 50, cmap='binary')
ax.plot_wireframe(X, Y, Z, color='black')
ax.plot_surface(X, Y, Z, rstride=1, cstride=1,
                cmap='viridis', edgecolor='none')
ax.view_init(60, 35)

Images

matplotlib.pyplot.imread

imread returns the image data as an array. The returned array has shape

(M, N) for grayscale images.
(M, N, 3) for RGB images.
(M, N, 4) for RGBA images.

Plotnine

Uses grammar of graphics. Mostly similar to ggplot2 in R. Uses matplotlib.

from plotnine import *

(ggplot(data=<DATA>) +
    <GEOM_FUNCTION>(
       mapping=aes(<MAPPINGS>),
       stat=<STAT>,
       position=<POSITION>
    ) +
    <COORDINATE_FUNCTION> +
    <FACET_FUNCTION> +
    labs(x, y, title, color) +
    theme() +
    scale_x_log10() +
    scale_y_log10()
)

(ggplot(data) +
    geom_point() +
    facet_wrap("var", nrow=2)
    )

Aesthetics are a visual property of the objects in your plot.

Geometric objects are used to represent the data.

geom_point(mapping=aes(x=_, y=_, color=_, size=_, alpha=_, shape=_))
geom_smooth(mapping=aes(x, y, linetype, group, color))
geom_bar(mapping=aes(..., fill), stat=...)
geom_boxplot()
geom_abline()
geom_text()
geom_label()
geom_segment()

Statistics are used to summarise the data.

stat_count()
stat_summary(mapping, fun_ymin, fun_ymax, fun_y)

Facets are used to create subplots.

facet_wrap('var')
facet_grid('var1 ~ var2')

Requests

https://requests.readthedocs.io/en/latest/user/quickstart/

import requests

api = r"https://localhost:3000"

# always pass a timeout (seconds); without one a request can wait indefinitely
r = requests.get(api, params={"q" : url}, timeout=10)
r = requests.post(api, data = {'key':'value'}, timeout=10)

r.json()
r.headers # a dict
r.headers["Content-Type"]
r.status_code == 404
r.content
r.ok
r.url
r.text
r.encoding

Scikit-Learn

Encode categorical data. Can use OneHotEncoder too.

from sklearn.preprocessing import OrdinalEncoder

# Make copy to avoid changing original data
label_X_train = X_train.copy()
label_X_valid = X_valid.copy()

# Apply ordinal encoder to each column with categorical data
encoder = OrdinalEncoder()
label_X_train[object_cols] = encoder.fit_transform(X_train[object_cols])
label_X_valid[object_cols] = encoder.transform(X_valid[object_cols])

from sklearn.impute import SimpleImputer

imputer = SimpleImputer(strategy='constant') # for num; for cat use 'most_frequent'

# The imputer returns plain arrays, so restore the column names and also the
# row index; otherwise the frames get a fresh 0..n-1 index that may no longer
# line up with the targets when the inputs have a non-default index.
imputed_X_train = pd.DataFrame(
    imputer.fit_transform(X_train), columns=X_train.columns, index=X_train.index)
imputed_X_test = pd.DataFrame(
    imputer.transform(X_test), columns=X_test.columns, index=X_test.index)

from sklearn.metrics import mean_absolute_error
mean_absolute_error(y_test, y_fit)

Pipelines, useful for cleaning up and productionizing.

from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer

num_transformer = ...
cat_transformer = Pipeline(steps=[
    ('imputer', ...),
    ('onehot', ...)
])

preprocessor = ColumnTransformer(
    transformers=[
        ('num', num_transformer, numerical_cols),
        ('cat', cat_transformer, categorical_cols)
    ])

model = ...
# Bundle preprocessing and modeling code in a pipeline
pipeline = Pipeline(steps=[('preprocessor', preprocessor),
                           ('model', model)])

pipeline.fit(X_train, y_train)
preds = pipeline.predict(X_valid)