Friday, October 07, 2011

Stratified Randomisation 2: The Working Version

Yesterday I posted a basic proof-of-concept class for generating a randomisation schedule based on strata. Unfortunately, this code was flawed and was incorrect in its results: because it uses block randomisation within each stratum, the most a single stratum could differ by is the size of the block used minus 1. The following updated code is now completely self-contained, having no dependencies on the original article's code, and uses a generator to produce the schedule.

from random import choice
from collections import Counter, defaultdict
from itertools import permutations

class StratifiedProper(object):
    def __init__(self, strata, arms, block_size):
        self.strata = strata
        self.arms = arms
        self.block_size = block_size
        self.record = defaultdict(list)

        self.blocks = self.block_gen()
        self.blocks.next()

    def get_block(self, choices, size):
        return choice(
            list(permutations(choices * (size/len(choices))))
        )

    def new_block(self):
        return list(
            self.get_block(self.arms, self.block_size)
        )

    def block_gen(self):
        blocks = defaultdict(list)
        is_empty = lambda x: len(x) == 0
        stratum = yield None
        while True:
            if is_empty(blocks[stratum]):
                blocks[stratum] = self.new_block()
            pop = blocks[stratum].pop()
            stratum = yield pop

    def add(self, stratum):
        flip = self.blocks.send(stratum)
        self.record[stratum].append(flip)


strata = ['M<30', 'M>30', 'F<30', 'F>30']
arms = ['H','T']

stratrand = StratifiedProper(strata, arms, 2)
for _ in xrange(100000):
    stratrand.add(choice(strata))

for stratum in strata:
    count = Counter(stratrand.record[stratum])
    count['diff'] = abs(count['H']-count['T'])
    print stratum, 'H=%(H)d T=%(T)d Diff: %(diff)d' % count
M<30 H=12478 T=12478 Diff: 0
M>30 H=12448 T=12448 Diff: 0
F<30 H=12469 T=12470 Diff: 1
F>30 H=12605 T=12604 Diff: 1

M<30 H=12454 T=12454 Diff: 0
M>30 H=12442 T=12442 Diff: 0
F<30 H=12552 T=12552 Diff: 0
F>30 H=12552 T=12552 Diff: 0

M<30 H=12476 T=12477 Diff: 1
M>30 H=12455 T=12455 Diff: 0
F<30 H=12580 T=12580 Diff: 0
F>30 H=12488 T=12489 Diff: 1

M<30 H=12590 T=12589 Diff: 1
M>30 H=12427 T=12426 Diff: 1
F<30 H=12463 T=12464 Diff: 1
F>30 H=12521 T=12520 Diff: 1
Much better. While the difference between characteristics persists due to the random nature of the intake, the per characteristic assignment to a treatment is balanced, or at most off by block_size - 1.

No comments:

Post a Comment