| 1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
 | #!/usr/bin/env python
#
# Copyright (C) 2018 The Android Open Source Project
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#      http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Merge multiple CSV files, possibly with different columns.
"""
import argparse
import csv
import io
import heapq
import itertools
import operator
from zipfile import ZipFile
# Command-line interface. Options are registered in the order in which they
# are listed by --help.
args_parser = argparse.ArgumentParser(
    description='Merge given CSV files into a single one.')
# Optional explicit column list; without it the columns come from the inputs.
args_parser.add_argument(
    '--header',
    help=('Comma separated field names; if missing determines the header '
          'from input files.'))
# Switches the positional arguments from plain CSV files to ZIP archives.
args_parser.add_argument(
    '--zip_input',
    action="store_true",
    help='Treat files as ZIP archives containing CSV files to merge.')
# Optional sort key used to merge the (pre-sorted) inputs.
args_parser.add_argument(
    '--key_field',
    help=('The name of the field by which the rows should be sorted. Must be '
          'in the field names. Will be the first field in the output. All '
          'input files must be sorted by that field.'))
# Destination, opened for writing by argparse itself; '-' is the default.
args_parser.add_argument(
    '--output',
    type=argparse.FileType('w'),
    default='-',
    help='Output file for merged CSV.')
# Everything left on the command line is taken as the list of input files.
args_parser.add_argument('files', nargs=argparse.REMAINDER)
args = args_parser.parse_args()
def dict_reader(csvfile):
    """Wrap *csvfile* in a csv.DictReader using this script's CSV format.

    Fields are separated by ',' and quoted with '|'. The first row read from
    *csvfile* supplies the field names.
    """
    reader = csv.DictReader(csvfile, quotechar='|', delimiter=',')
    return reader
# Build one DictReader per input CSV. The readers are only consumed later,
# when the rows are written out, so the underlying streams are intentionally
# not closed here.
csv_readers = []
if not args.zip_input:
    for file in args.files:
        # newline='' hands line-ending handling to the csv module, as its
        # documentation requires; otherwise newlines embedded in quoted
        # fields may be misinterpreted.
        csv_readers.append(dict_reader(open(file, 'r', newline='')))
else:
    for file in args.files:
        # NOTE(review): the entry streams opened below are read after this
        # `with` block has exited; this relies on zipfile keeping
        # already-opened members readable -- confirm.
        with ZipFile(file) as zipfile:
            for entry in zipfile.namelist():
                # Only archive members whose name ends in '.uau' are merged.
                if entry.endswith('.uau'):
                    csv_readers.append(
                        dict_reader(
                            io.TextIOWrapper(
                                zipfile.open(entry, 'r'), newline=''
                            )
                        )
                    )
if args.header:
    # An explicit --header decides the output columns and their order.
    fieldnames = args.header.split(',')
else:
    # Build union of all columns from source files. The dict acts as a set
    # that keeps first-seen order (dicts preserve insertion order on
    # Python 3.7+); the values are unused placeholders.
    headers = {}
    for reader in csv_readers:
        # DictReader.fieldnames is None when the input has no header row at
        # all (an empty file); treat that as "contributes no columns" rather
        # than failing with a TypeError while iterating None.
        for fieldname in reader.fieldnames or ():
            headers[fieldname] = ""
    fieldnames = list(headers)
# By default chain the csv readers together so that the resulting output is
# the concatenation of the rows from each of them:
all_rows = itertools.chain.from_iterable(csv_readers)
if csv_readers:
    keyField = args.key_field
    if keyField:
        # Validate with an explicit raise instead of `assert`: assert
        # statements are removed under `python -O`, which would skip this
        # check and leave only the terse error from list.index() below.
        if keyField not in fieldnames:
            raise ValueError(
                "--key_field {} not found, must be one of {}\n".format(
                    keyField, ",".join(fieldnames)
                )
            )
        # Make the key field the first field in the output
        keyFieldIndex = fieldnames.index(keyField)
        fieldnames.insert(0, fieldnames.pop(keyFieldIndex))
        # Create an iterable that performs a lazy merge sort on the csv readers
        # sorting the rows by the key field. This only yields sorted output if
        # every input is itself already sorted by that field.
        all_rows = heapq.merge(*csv_readers, key=operator.itemgetter(keyField))
# Emit the merged result to the chosen output: first the header line, then
# every row. The 'unix' dialect is the base; delimiter, quote character and
# quoting mode are set explicitly on top of it.
writer = csv.DictWriter(
    args.output,
    fieldnames=fieldnames,
    dialect='unix',
    quoting=csv.QUOTE_MINIMAL,
    quotechar='|',
    delimiter=',',
)
writer.writeheader()
# all_rows is consumed here, in the order it yields rows (plain concatenation
# or merged by key, depending on how it was built above).
writer.writerows(all_rows)
 |