from contextlib import closing
from optparse import OptionParser
import random
import sys
def parse_options(argv, **defaults):
options = OptionParser()
options.add_option('-n', "--sample-size",
action="store",
type="int",
dest="sample_size",
default=defaults.get('sample_size', 100),
metavar='SIZE')
options.add_option('-o', "--output",
action="store",
dest="output",
default='-',
metavar='FILE')
return options.parse_args(argv)
def sample(sample_size, items):
results = []
with closing(items):
for count, item in enumerate(items):
if len(results) < sample_size:
results.append(item)
else:
should_use = int(random.uniform(0, count)) < sample_size
if should_use:
replace_index = int(random.uniform(0, sample_size))
results[replace_index] = item
return results
def main(argv):
options, file_paths = parse_options(argv, sample_size=10, file_path=r'c:/temp/branches')
sample_size = options.sample_size
output = options.output
if len(file_paths) > 1 :
source = open(file_paths[1])
else:
source = sys.stdin
results = sample(sample_size, source.xreadlines())
if output == '-':
out = sys.stdout
else:
out = open(output, "wb")
with closing(out):
for line in results:
out.write(line)
random.seed()
if __name__ == "__main__":
main(sys.argv)
Friday, August 26, 2011
Single Pass Random Sampling
I needed to create a random sample from a file containing several million lines. The script below will extract random lines from a file in a single pass while guaranteeing that each sample line was chosen with probability of 1/N (where N is the number of lines in the original file):
Thursday, August 25, 2011
Week Sequence in Posgres
Here's how to generate a list of "week of" dates for the last 12 weeks using Postgres SQL:
select date_trunc('week', current_date)::date - s.t as "weekOf"
from generate_series(0, 7*12, 7) as s(t)
order by "weekOf" asc;
Result:
- 5/30/2011
- 6/6/2011
- 6/13/2011
- 6/20/2011
- 6/27/2011
- 7/4/2011
- 7/11/2011
- 7/18/2011
- 7/25/2011
- 8/1/2011
- 8/8/2011
- 8/15/2011
- 8/22/2011
Subscribe to:
Posts (Atom)