#!/usr/bin/env python

import csv
import getopt
import os
import subprocess
import sys

def usage(f=sys.stdout):
    """Write the command-line help text to f.

    f is any object with a write() method; it defaults to the sys.stdout
    object that was current when this function was defined. The program
    name shown in the text is taken from sys.argv[0].
    """
    help_lines = (
        'Usage: %s -f field1,field2 file_a file_b',
        '',
        'Does a full outer join on two CSV files by equality on the field names',
        'given by the -f option. The first line of each file must be a header',
        'with field names.',
        '',
        'The input files do not have to be sorted. They will be sorted internally',
        'by passing them to the "sort" program with comma-separated keys. So the',
        'input rows must be parseable by simple splitting on commas, and cannot',
        'use, for example, quoted strings with embedded commas.',
        '',  # yields the trailing newline after the join below
    )
    template = "\n".join(help_lines)
    f.write(template % sys.argv[0])

# Call sort in a subprocess using the given indices as keys and return
# the subprocess's stdout.
#
# f is an open file; whatever remains unread on its descriptor becomes the
# subprocess's stdin. indices are 0-based column numbers, most significant
# key first. Fields are delimited by commas.
def sort(f, indices):
    command = ["sort", "-t,"]
    # Restrict every key to exactly one field ("-k N,N"). A bare "-k N" runs
    # from field N to the end of the line, and "-k M,N" covers the whole range
    # of fields M..N; either one orders rows differently from the per-field
    # tuple comparison that join() performs on the sorted output.
    for i in indices:
        command.extend(["-k", "%d,%d" % (i + 1, i + 1)])
    # Force bytewise collation, but keep the rest of the caller's environment
    # (PATH, TMPDIR, ...) instead of replacing it wholesale.
    environment = dict(os.environ, LC_ALL="C")
    p = subprocess.Popen(command,
        stdin=f, stdout=subprocess.PIPE,
        env=environment,
        bufsize=-1)
    return p.stdout

def readrow(f):
    """Read one line from f and split it into fields on commas.

    Returns None when f.readline() yields an empty string (end of input);
    otherwise returns the list of fields.

    Only the line terminator ("\\n" or "\\r\\n") is removed. Other leading or
    trailing whitespace is part of the first or last field and is kept, so
    the values compared here are the same bytes that "sort" ordered.
    """
    line = f.readline()
    if not line:
        return None
    return line.rstrip("\r\n").split(",")

def getkey(row, indices):
    """Return the tuple of row's values at the given positions, in order.

    indices is an iterable of positions into row; the result has one entry
    per index, in the same order as indices.
    """
    picked = []
    for position in indices:
        picked.append(row[position])
    return tuple(picked)

def makeoutputrow(row_a, field_indices_a, row_b, field_indices_b):
    """Build one output row from a row of each input (either may be missing).

    The result is laid out as: the key values, then the non-key values of
    row_a, then the non-key values of row_b. Key values come from row_a when
    it is truthy and from row_b otherwise. A missing (falsy) side contributes
    one empty string per non-key column of that file's header, which is read
    from the module-level fieldnames_a / fieldnames_b.
    """
    if row_a:
        key_cells = [row_a[k] for k in field_indices_a]
        rest_a = [cell for pos, cell in enumerate(row_a)
                  if pos not in field_indices_a]
    else:
        key_cells = [row_b[k] for k in field_indices_b]
        # Pad with as many blanks as file A has non-key columns.
        missing_a = [pos for pos in range(len(fieldnames_a))
                     if pos not in field_indices_a]
        rest_a = [""] * len(missing_a)

    if row_b:
        rest_b = [cell for pos, cell in enumerate(row_b)
                  if pos not in field_indices_b]
    else:
        # Pad with as many blanks as file B has non-key columns.
        missing_b = [pos for pos in range(len(fieldnames_b))
                     if pos not in field_indices_b]
        rest_b = [""] * len(missing_b)

    return key_cells + rest_a + rest_b

def join(csvw, file_a, field_indices_a, file_b, field_indices_b):
    """Merge two key-sorted row streams into csvw as a full outer join.

    file_a and file_b are read one row at a time with readrow(); they are
    expected to already be ordered by their key columns (field_indices_a and
    field_indices_b respectively). Every output row is produced by
    makeoutputrow() and written with csvw.writerow().
    """
    def emit(left, right):
        csvw.writerow(makeoutputrow(left, field_indices_a, right, field_indices_b))

    left = readrow(file_a)
    right = readrow(file_b)

    while left and right:
        left_key = getkey(left, field_indices_a)
        right_key = getkey(right, field_indices_b)
        if left_key < right_key:
            # Row only present in A (so far): emit it unmatched and advance A.
            emit(left, None)
            left = readrow(file_a)
        elif left_key > right_key:
            # Row only present in B (so far): emit it unmatched and advance B.
            emit(None, right)
            right = readrow(file_b)
        else:
            # Equal keys. Collect the whole run of B rows sharing this key,
            # then pair each A row of the same key with every one of them.
            matches = []
            while right and getkey(right, field_indices_b) == left_key:
                matches.append(right)
                right = readrow(file_b)
            while left and getkey(left, field_indices_a) == left_key:
                for match in matches:
                    emit(left, match)
                left = readrow(file_a)

    # At most one side still has rows. Drain it without comparing keys;
    # reading from the exhausted side just keeps returning None.
    while left or right:
        emit(left, right)
        left = readrow(file_a)
        right = readrow(file_b)

# Names of the join fields, as given by the -f option.
FIELDNAMES = None

opts, args = getopt.gnu_getopt(sys.argv[1:], "f:h", ["help"])
for o, a in opts:
    if o == "-f":
        FIELDNAMES = a.split(",")
    elif o in ("-h", "--help"):
        usage()
        sys.exit()

if not FIELDNAMES:
    sys.stderr.write("Provide field names with the -f option.\n")
    sys.exit(1)

# Turn off buffering because we're just going to read the header and pass the
# rest to a subprocess.
if len(args) == 1:
    file_a = open(args[0], "rb", 0)
    # With a single file name, the second input is standard input.
    # os.fdopen takes the integer file descriptor, not the sys.stdin object.
    file_b = os.fdopen(sys.stdin.fileno(), "rb", 0)
elif len(args) == 2:
    file_a = open(args[0], "rb", 0)
    file_b = open(args[1], "rb", 0)
else:
    sys.stderr.write("Provide one or two input file names.\n")
    sys.exit(1)

# The first line of each input is its header; readrow returns None if the
# input is completely empty.
fieldnames_a = readrow(file_a)
fieldnames_b = readrow(file_b)
if fieldnames_a is None:
    sys.stderr.write("The first file is empty; a header line is required.\n")
    sys.exit(1)
if fieldnames_b is None:
    sys.stderr.write("The second file is empty; a header line is required.\n")
    sys.exit(1)

# Translate the requested field names into column positions in each file.
field_indices_a = []
field_indices_b = []
for fieldname in FIELDNAMES:
    try:
        field_indices_a.append(fieldnames_a.index(fieldname))
    except ValueError:
        sys.stderr.write("Field name %r is not in the first file.\n" % fieldname)
        sys.exit(1)
    try:
        field_indices_b.append(fieldnames_b.index(fieldname))
    except ValueError:
        sys.stderr.write("Field name %r is not in the second file.\n" % fieldname)
        sys.exit(1)

# Output header: key columns first, then the remaining columns of the first
# file, then the remaining columns of the second file.
fieldnames_out = [fieldnames_a[i] for i in field_indices_a]
fieldnames_out.extend([name for i, name in enumerate(fieldnames_a) if i not in field_indices_a])
fieldnames_out.extend([name for i, name in enumerate(fieldnames_b) if i not in field_indices_b])

csvw = csv.writer(sys.stdout, lineterminator="\n")
csvw.writerow(fieldnames_out)

# The headers have been consumed above, so each sort subprocess receives only
# the data rows that remain on the file descriptor.
join(csvw,
    sort(file_a, field_indices_a), field_indices_a,
    sort(file_b, field_indices_b), field_indices_b)