Some Awful Python Code

Here’s the modeling code from the substack post.

from math import comb
from math import log10
import sys


def tprb(n, k, t):

        totalp = 1

        for i in range(0,k,1):
                p = 1 - (t / (n - i))
                totalp = totalp * p

        return 1-totalp

ctoffset = 45
ctslope  = -3.36

def predictCT(fragments,ampregion,fragmentsize,genomesize):
	totalfragments    = genomesize-fragmentsize
	ontargetfragments = fragmentsize-ampregion
	targetfragments = fragments*(ontargetfragments/totalfragments)
	ct = ctoffset+log10(targetfragments)*ctslope
	return ct

genomesize      = 30000
totalng         = 10
qpcrampregion   = 90
basesperng      = 1771000000000 
rrnafraction    = 0.6
fragmentsize    = 1000
starttargetfraction  = 0.000000001
targetfraction  = starttargetfraction


totalfragments  = (basesperng*totalng)/fragmentsize
reads           = 100000
targetfragments = totalfragments*targetfraction

totalfragments_norRNA = ((totalfragments-targetfragments)*(1-rrnafraction)) + targetfragments

# Vary reads count
#print ("total fragments, reads, target fragments, probability of >1 read, probability of >1 read (no rRNA), predicted CT")
#for creads in range(reads,100000000,1000000):
#	print (totalfragments, " ", creads, " ", targetfragments, " ", tprb(totalfragments,creads,targetfragments), " ", tprb(totalfragments_norRNA,creads,targetfragments),predictCT(targetfragments,qpcrampregion,fragmentsize,genomesize))

# Vary target fraction
#print ("target fraction, total fragments, reads, target fragments, probability of >1 read, probability of >1 read (no rRNA), predicted CT")
#while targetfraction < 1:
#	targetfragments = totalfragments*targetfraction
#	print (targetfraction, " ", totalfragments, " ", reads, " ", targetfragments, " ", tprb(totalfragments,reads,targetfragments), " ", tprb(totalfragments_norRNA,reads,targetfragments),predictCT(targetfragments,qpcrampregion,fragmentsize,genomesize))
#	targetfraction *= 10

# Vary both
#print(", ", end='')
#while targetfraction < 1:
#	targetfragments = totalfragments*targetfraction
#	print(targetfraction, ", ", end='')
#	targetfraction *= 2
#print()

#for creads in range(reads,5000000,100000):
#	print(creads, ", ", end='')
#	targetfraction  = starttargetfraction
#	while targetfraction < 1:
#		targetfragments = totalfragments*targetfraction
#		print (tprb(totalfragments,creads,targetfragments), ", ", end='')
#		
#		targetfraction *= 2
#	print()

# Vary both - CT
print(", ", end='')
while targetfraction < 1:
	targetfragments = totalfragments*targetfraction
	print("{:.2f}".format(predictCT(targetfragments,qpcrampregion,fragmentsize,genomesize)), ", ", end='')
	targetfraction *= 2
print()

for creads in range(reads,5000000,100000):
	print(creads, ", ", end='')
	targetfraction  = starttargetfraction
	while targetfraction < 1:
		targetfragments = totalfragments*targetfraction
		print (tprb(totalfragments,creads,targetfragments), ", ", end='')
		
		targetfraction *= 2
	print()

Ion314 die images

N choose K, with T targets. Probability of choosing at least one target

Looking at n choose k, but you want to get at least one of a target t. For example you have a bag of blue marbles, which contains some number of red marbles. N is the total number of marbles in the bag. K is the number of marbles being chosen. T is the number of red marbles.

This is probably well known, but I worked through it a few different ways anyway…

First a simulation (C++):

#include <iostream>
#include <vector>

using namespace std;


int main(int argc,char **argv) {

  int n = 3000;
  int k = 50;
  int t = 30; // targets

  vector<int> items(n,0);

  for(int i=0;i<t;i++) {
    int itm=rand()%items.size();
    for(;items[itm] !=0;) itm = rand()%items.size();
    items[itm] = 1;
    //items[i] = 1;
  }

  //cout << "items: ";
  //for(int i=0;i<items.size();i++) cout << items[i] << " ";
  //cout << endl;

  vector<int> count(t,0);

  // rounds of choosing
  int rounds=100000;
  for(int j=0;j<rounds;j++) {
    cout << "round: " << j << endl;
    int tcount=0;

    vector<int> selected;
    for(int i=0;i<k;i++) {

      int itm = rand()%items.size();
      for(;std::count(selected.begin(), selected.end(), itm);) itm = rand()%items.size();
      selected.push_back(itm);
      //cout << "selected: " << itm << " " << items[itm] << endl;
      if(i%100000==0) cout << i << endl;
    }

    //count items on target
    for(int i=0;i<selected.size();i++) {
      if(items[selected[i]] == 1) tcount++;
    }
    cout << "tcount: " << tcount << endl;

    count[tcount]++;
  }

  int morezero=0;
  for(int i=0;i<t;i++) {
    if((count[i] > 0) || (i==0)) cout << i << " " << count[i] << endl;
    if(i>=1) morezero+=count[i];
  }
  cout << "One or more: " << morezero << endl;
  cout << "One or more fraction: " << (double)morezero/rounds << endl;
}

Next I calculated this by looking at the number of combinations calculating the fraction of combinations containing at least one target and total number of combinations:

from math import comb
import sys
def totalcomb(n, k, t):
        return comb(n,k)
def targetcomb(n, k, t):

        total = 0
        total += comb(t,k)
        for i in range(k-1,0,-1): #(i=k-1;i>0;i--)
                # print(t," ",i, ", ", n, " ", k-i)
                # Ways of choosing i items out of t. Multiply by
                total += comb(t,i)*comb(n-t,k-i)
        return total

n = int(sys.argv[1])
k = int(sys.argv[2])
t = int(sys.argv[3])
print (targetcomb(n,k,t)/totalcomb(n,k,t))
print (targetcomb(n,k,t))
print (totalcomb(n,k,t))

Then in terms of probability, using the probability that each draw does not contain a target:

from math import comb
import sys

def tprb(n, k, t):

        totalp = 1

        for i in range(0,k,1):
                p = 1 - (t / (n - i))
                totalp = totalp * p

        return 1-totalp

n = int(sys.argv[1])
k = int(sys.argv[2])
t = int(sys.argv[3])

print (tprb(n,k,t))

All approaches give the same answer, the last is the least computationally taxing…

Illumina iSeq (FireFly) IC Images

Comments will be posted over on the substack.