Friday, August 26, 2011

Single Pass Random Sampling

"""Single-pass random sampling of lines from a file.

I needed to create a random sample from a file containing several million
lines. The script below extracts random lines from a file in a single pass
while guaranteeing that every line of the input has the same probability,
n/N, of ending up in the sample (where n is the sample size and N is the
number of lines in the original file).
"""

from contextlib import closing
from optparse import OptionParser
import random
import sys


def parse_options(argv, **defaults):
    """Parse the command line.

    Args:
        argv: Full argument list, including the program name at index 0.
        **defaults: Optional overrides for option defaults. Only
            ``sample_size`` is consulted (falls back to 100).

    Returns:
        The ``(options, positional_args)`` pair from ``OptionParser``.
        Because the whole ``argv`` is parsed, ``positional_args[0]`` is the
        program name.
    """
    options = OptionParser()
    options.add_option('-n', "--sample-size", action="store", type="int",
                       dest="sample_size",
                       default=defaults.get('sample_size', 100),
                       metavar='SIZE')
    options.add_option('-o', "--output", action="store", dest="output",
                       default='-', metavar='FILE')
    return options.parse_args(argv)


def sample(sample_size, items):
    """Return up to ``sample_size`` items drawn uniformly from ``items``.

    Uses reservoir sampling (Algorithm R): the input is consumed exactly
    once and only ``sample_size`` items are held in memory. If the input has
    fewer than ``sample_size`` items, all of them are returned in order.

    Args:
        sample_size: Maximum number of items to return. Zero or a negative
            value yields an empty list.
        items: Any iterable. If it has a ``close()`` method (e.g. a file),
            it is closed once iteration finishes, even on error.

    Returns:
        A list with ``min(sample_size, N)`` items.
    """
    results = []
    try:
        for count, item in enumerate(items):
            if len(results) < sample_size:
                # Fill the reservoir with the first ``sample_size`` items.
                results.append(item)
                continue
            # ``count`` is zero-based, so this is the (count + 1)-th item
            # and must be kept with probability sample_size / (count + 1).
            # One draw over [0, count] decides both whether to keep it and
            # which slot it replaces.
            replace_index = random.randrange(count + 1)
            if replace_index < sample_size:
                results[replace_index] = item
    finally:
        # Plain iterables (lists, ranges) have no close(); files do.
        close = getattr(items, "close", None)
        if close is not None:
            close()
    return results


def main(argv):
    """Sample lines from a file (or stdin) and write them out.

    Args:
        argv: Full argument list, including the program name at index 0.
            The first real positional argument is the input path; without
            it, stdin is read. ``-o FILE`` selects the output file
            (``-`` or omitted means stdout); ``-n SIZE`` sets the sample
            size (default 10).
    """
    options, file_paths = parse_options(argv, sample_size=10)
    sample_size = options.sample_size
    output = options.output

    # Lines are handled as raw bytes end to end so they are copied verbatim
    # on both Python 2 and Python 3 (``.buffer`` only exists on Python 3).
    if len(file_paths) > 1:
        # file_paths[0] is the program name; the input path follows it.
        source = open(file_paths[1], "rb")
    else:
        source = getattr(sys.stdin, "buffer", sys.stdin)

    # sample() closes ``source`` when it is done reading.
    results = sample(sample_size, source)

    if output == '-':
        # Flush rather than close: stdout belongs to the process.
        out = getattr(sys.stdout, "buffer", sys.stdout)
        out.writelines(results)
        out.flush()
    else:
        with closing(open(output, "wb")) as out:
            out.writelines(results)


if __name__ == "__main__":
    # Seed only when run as a script so that importing this module does not
    # disturb a caller's random state.
    random.seed()
    main(sys.argv)

No comments: