Squigglepy: Implementation of Squiggle in Python

Squiggle is a "simple programming language for intuitive probabilistic estimation". It serves as its own standalone programming language with its own syntax, but it is implemented in JavaScript. I like the features of Squiggle and intend to use it frequently, but I also sometimes want to use similar functionalities in Python, especially alongside other Python statistical programming packages like Numpy, Pandas, and Matplotlib. The squigglepy package here implements many Squiggle-like functionalities in Python.

Installation

pip install squigglepy

For plotting support, you can also use the plots extra:

pip install squigglepy[plots]

Usage

Piano Tuners Example

Here's the Squigglepy implementation of the example from Squiggle Docs:

import squigglepy as sq
import numpy as np
import matplotlib.pyplot as plt
from squigglepy.numbers import K, M
from pprint import pprint

pop_of_ny_2022 = sq.to(8.1*M, 8.4*M)  # This means that you're 90% confident the value is between 8.1 and 8.4 Million.
pct_of_pop_w_pianos = sq.to(0.2, 1) * 0.01  # We assume there are almost no people with multiple pianos
pianos_per_piano_tuner = sq.to(2*K, 50*K)
piano_tuners_per_piano = 1 / pianos_per_piano_tuner
total_tuners_in_2022 = pop_of_ny_2022 * pct_of_pop_w_pianos * piano_tuners_per_piano
samples = total_tuners_in_2022 @ 1000  # Note: `@ 1000` is shorthand to get 1000 samples

# Get mean and SD
print('Mean: {}, SD: {}'.format(round(np.mean(samples), 2),
                                round(np.std(samples), 2)))

# Get percentiles
pprint(sq.get_percentiles(samples, digits=0))

# Histogram
plt.hist(samples, bins=200)
plt.show()

# Shorter histogram
total_tuners_in_2022.plot()

And the version from the Squiggle doc that incorporates time:

import squigglepy as sq
from squigglepy.numbers import K, M

pop_of_ny_2022 = sq.to(8.1*M, 8.4*M)
pct_of_pop_w_pianos = sq.to(0.2, 1) * 0.01
pianos_per_piano_tuner = sq.to(2*K, 50*K)
piano_tuners_per_piano = 1 / pianos_per_piano_tuner

def pop_at_time(t):  # t = Time in years after 2022
    avg_yearly_pct_change = sq.to(-0.01, 0.05)  # We're expecting NYC to continuously grow with an mean of roughly between -1% and +4% per year
    return pop_of_ny_2022 * ((avg_yearly_pct_change + 1) ** t)

def total_tuners_at_time(t):
    return pop_at_time(t) * pct_of_pop_w_pianos * piano_tuners_per_piano

# Get total piano tuners at 2030
sq.get_percentiles(total_tuners_at_time(2030-2022) @ 1000)

WARNING: Be careful about dividing by K, M, etc. 1/2*K = 500 in Python. Use 1/(2*K) instead to get the expected outcome.

WARNING: Be careful about using K to get sample counts. Use sq.norm(2, 3) @ (2*K)... sq.norm(2, 3) @ 2*K will return only two samples, multiplied by 1000.

Distributions

import squigglepy as sq

# Normal distribution
sq.norm(1, 3)  # 90% interval from 1 to 3

# Distribution can be sampled with mean and sd too
sq.norm(mean=0, sd=1)

# Shorthand to get one sample
~sq.norm(1, 3)

# Shorthand to get more than one sample
sq.norm(1, 3) @ 100

# Longhand version to get more than one sample
sq.sample(sq.norm(1, 3), n=100)

# Nice progress reporter
sq.sample(sq.norm(1, 3), n=1000, verbose=True)

# Other distributions exist
sq.lognorm(1, 10)
sq.invlognorm(1, 10)
sq.tdist(1, 10, t=5)
sq.triangular(1, 2, 3)
sq.pert(1, 2, 3, lam=2)
sq.binomial(p=0.5, n=5)
sq.beta(a=1, b=2)
sq.bernoulli(p=0.5)
sq.poisson(10)
sq.chisquare(2)
sq.gamma(3, 2)
sq.pareto(1)
sq.exponential(scale=1)
sq.geometric(p=0.5)

# Discrete sampling
sq.discrete({'A': 0.1, 'B': 0.9})

# Can return integers
sq.discrete({0: 0.1, 1: 0.3, 2: 0.3, 3: 0.15, 4: 0.15})

# Alternate format (also can be used to return more complex objects)
sq.discrete([[0.1,  0],
             [0.3,  1],
             [0.3,  2],
             [0.15, 3],
             [0.15, 4]])

sq.discrete([0, 1, 2]) # No weights assumes equal weights

# You can mix distributions together
sq.mixture([sq.norm(1, 3),
            sq.norm(4, 10),
            sq.lognorm(1, 10)],  # Distributions to mix
           [0.3, 0.3, 0.4])     # These are the weights on each distribution

# This is equivalent to the above, just a different way of doing the notation
sq.mixture([[0.3, sq.norm(1,3)],
            [0.3, sq.norm(4,10)],
            [0.4, sq.lognorm(1,10)]])

# Make a zero-inflated distribution
# 60% chance of returning 0, 40% chance of sampling from `norm(1, 2)`.
sq.zero_inflated(0.6, sq.norm(1, 2))

Additional Features

import squigglepy as sq

# You can add and subtract distributions
(sq.norm(1,3) + sq.norm(4,5)) @ 100
(sq.norm(1,3) - sq.norm(4,5)) @ 100
(sq.norm(1,3) * sq.norm(4,5)) @ 100
(sq.norm(1,3) / sq.norm(4,5)) @ 100

# You can also do math with numbers
~((sq.norm(sd=5) + 2) * 2)
~(-sq.lognorm(0.1, 1) * sq.pareto(1) / 10)

# You can change the CI from 90% (default) to 80%
sq.norm(1, 3, credibility=80)

# You can clip
sq.norm(0, 3, lclip=0, rclip=5) # Sample norm with a 90% CI from 0-3, but anything lower than 0 gets clipped to 0 and anything higher than 5 gets clipped to 5.

# You can also clip with a function, and use pipes
sq.norm(0, 3) >> sq.clip(0, 5)

# You can correlate continuous distributions
a, b = sq.uniform(-1, 1), sq.to(0, 3)
a, b = sq.correlate((a, b), 0.5)  # Correlate a and b with a correlation of 0.5
# You can even pass your own correlation matrix!
a, b = sq.correlate((a, b), [[1, 0.5], [0.5, 1]])

Example: Rolling Dice and Flipping Coins

Squigglepy has built-in support for dice and coins as distribution objects:

import squigglepy as sq

# Roll a 6-sided die
sq.die(6) @ 10  # Roll 10 times
# [2, 6, 5, 2, 6, 2, 3, 1, 5, 2]

# Use the ~ operator for a single sample
~sq.die(6)  # Roll once
# 4

# Flip a coin
~sq.coin()
# 'heads'

sq.coin() @ 5  # Flip 5 times
# ['heads', 'tails', 'heads', 'heads', 'tails']

# Exploding dice (roll again when you get certain values)
sq.die(6, explode_on=6) @ 5  # D6 that explodes on 6
# [3, 8, 2, 5, 1]  # The 8 came from rolling 6 + 2

Since die and coin are distribution objects, they work with all distribution operations like +, -, *, /, and mixtures.

Model Functions

You can define a model as a Python function that uses squigglepy distributions, then sample from it using sq.sample. This is useful for building more complex probabilistic models:

import squigglepy as sq

# Define a simple model function
def revenue_model():
    customers = sq.to(100, 500)  # 90% CI: 100-500 customers
    revenue_per_customer = sq.to(10, 50)  # 90% CI: $10-$50 per customer
    return customers * revenue_per_customer

# Sample from the model
samples = sq.sample(revenue_model, n=1000)

# Analyze the results
sq.get_percentiles(samples)

Model functions can include conditional logic and more complex structures:

import squigglepy as sq

def project_cost_model():
    # Base cost estimate
    base_cost = sq.to(50_000, 100_000)

    # Risk factor - 30% chance of complications
    if sq.event(0.3):
        # Complications add 20-50% to cost
        multiplier = sq.uniform(1.2, 1.5)
    else:
        multiplier = sq.const(1.0)

    return base_cost * multiplier

# Get 1000 samples from the model
results = sq.sample(project_cost_model, n=1000)
print(sq.get_percentiles(results))

Bayesian inference

1% of women at age forty who participate in routine screening have breast cancer. 80% of women with breast cancer will get positive mammographies. 9.6% of women without breast cancer will also get positive mammographies.

A woman in this age group had a positive mammography in a routine screening. What is the probability that she actually has breast cancer?

We can approximate the answer with a Bayesian network (uses rejection sampling):

import squigglepy as sq
from squigglepy import bayes
from squigglepy.numbers import M

def mammography(has_cancer):
    return sq.event(0.8 if has_cancer else 0.096)

def define_event():
    cancer = ~sq.bernoulli(0.01)
    return({'mammography': mammography(cancer),
            'cancer': cancer})

bayes.bayesnet(define_event,
               find=lambda e: e['cancer'],
               conditional_on=lambda e: e['mammography'],
               n=1*M)
# 0.07723995880535531

Or if we have the information immediately on hand, we can directly calculate it. Though this doesn't work for very complex stuff.

from squigglepy import bayes
bayes.simple_bayes(prior=0.01, likelihood_h=0.8, likelihood_not_h=0.096)
# 0.07763975155279504

You can also make distributions and update them:

import matplotlib.pyplot as plt
import squigglepy as sq
from squigglepy import bayes
from squigglepy.numbers import K
import numpy as np

print('Prior')
prior = sq.norm(1,5)
prior_samples = prior @ (10*K)
plt.hist(prior_samples, bins = 200)
plt.show()
print(sq.get_percentiles(prior_samples))
print('Prior Mean: {} SD: {}'.format(np.mean(prior_samples), np.std(prior_samples)))
print('-')

print('Evidence')
evidence = sq.norm(2,3)
evidence_samples = evidence @ (10*K)
plt.hist(evidence_samples, bins = 200)
plt.show()
print(sq.get_percentiles(evidence_samples))
print('Evidence Mean: {} SD: {}'.format(np.mean(evidence_samples), np.std(evidence_samples)))
print('-')

print('Posterior')
posterior = bayes.update(prior, evidence)
posterior_samples = posterior @ (10*K)
plt.hist(posterior_samples, bins = 200)
plt.show()
print(sq.get_percentiles(posterior_samples))
print('Posterior Mean: {} SD: {}'.format(np.mean(posterior_samples), np.std(posterior_samples)))

print('Average')
average = bayes.average(prior, evidence)
average_samples = average @ (10*K)
plt.hist(average_samples, bins = 200)
plt.show()
print(sq.get_percentiles(average_samples))
print('Average Mean: {} SD: {}'.format(np.mean(average_samples), np.std(averag

Squigglepy

Install / Use

README