tools/aapt2/remove-duplicates.py


1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181

#!/usr/bin/env python

import os
import os.path
import sys
import tempfile
import xml.parsers.expat

"""
Scans each XML resource file under the res/values*/ directories looking for
duplicates. All but the last occurrence of a resource definition are removed.
This makes no semantic change: the resulting APK, when built, should contain
the same definitions.
"""

class Duplicate:
    """Plain record holding one resource definition and where it sits in its file.

    The constructor stores its arguments unchanged:
        name:    identifier of the resource.
        product: product qualifier of the definition.
        depth:   element nesting depth at which the definition was seen.
        start:   position where the definition begins.
        end:     position where the definition ends; may be None until known.
    """
    def __init__(self, name, product, depth, start, end):
        self.name, self.product = name, product
        self.depth = depth
        self.start, self.end = start, end

class ResourceDefinitionLocator:
    """Handler object for xml.parsers.expat that collects every resource definition
    found directly under the document's root element, together with its location.

    After parsing, resource_definitions maps "<type>/<name>:<product>" to the list
    of Duplicate records seen for that key, in document order.
    """
    def __init__(self, parser):
        self._parser = parser
        self._depth = 0
        self._current_resource = None
        self.resource_definitions = {}

    def start_element(self, tag_name, attrs):
        """Start-tag callback: opens a record for each tracked element at depth 2."""
        self._depth += 1
        # Only direct children of the root element are resource definitions.
        if self._depth != 2:
            return
        # These tags are deliberately not tracked.
        if tag_name in ("public", "java-symbol", "eat-comment", "skip"):
            return

        product = attrs.get("product", "")
        # <item> carries its resource type in an attribute; every other tag is the type.
        resource_type = attrs["type"] if tag_name == "item" else tag_name
        resource_name = "{0}/{1}".format(resource_type, attrs["name"])
        # Lines are stored 0-based (the parser reports them 1-based).
        start = (self._parser.CurrentLineNumber - 1, self._parser.CurrentColumnNumber)
        self._current_resource = Duplicate(resource_name, product, self._depth, start, None)

    def end_element(self, tag_name):
        """End-tag callback: closes the open record and files it under its key."""
        resource = self._current_resource
        if resource and self._depth == resource.depth:
            # The end position is the parser's current column advanced by the length
            # of "</" + tag_name + ">", i.e. len(tag_name) + 3.
            # NOTE(review): this assumes the parser is positioned on a literal
            # "</tag>" end tag; confirm what it reports for "<tag/>" elements.
            resource.end = (self._parser.CurrentLineNumber - 1,
                    self._parser.CurrentColumnNumber + len(tag_name) + 3)
            key_name = "{0}:{1}".format(resource.name, resource.product)
            self.resource_definitions.setdefault(key_name, []).append(resource)
            self._current_resource = None
        self._depth -= 1

def _empty_element_end(lines, line_no, col_no):
    """Finds where an empty-element tag starting at (line_no, col_no) ends.

    Scans forward from the tag's '<' to the first '>' that is not inside a
    quoted attribute value. Returns the (line, column) just past that '>' when
    the tag is self-closing ('<foo ... />'); returns None when it is an
    ordinary start tag or when no closing '>' is found.
    """
    quote = None
    previous = ""
    while line_no < len(lines):
        line = lines[line_no]
        while col_no < len(line):
            char = line[col_no]
            col_no += 1
            if quote:
                if char == quote:
                    quote = None
            elif char in ('"', "'"):
                quote = char
            elif char == ">":
                if previous == "/":
                    return (line_no, col_no)
                return None
            previous = char
        line_no += 1
        col_no = 0
    return None

def remove_duplicates(xml_path):
    """Rewrites xml_path with duplicate resource definitions removed.

    For every resource defined more than once (same type, name and product),
    only the last definition is kept. One message is printed per removed
    definition. A file without duplicates is left untouched. Otherwise the
    result is written to a temporary file in the same directory, which is then
    renamed over the original.

    Parse errors (xml.parsers.expat.ExpatError) and I/O errors propagate to the
    caller; the temporary file is removed if writing or renaming fails.
    """
    # Read the raw bytes so that line endings reach the parser and the output
    # exactly as they are stored in the file.
    with open(xml_path, "rb") as fin:
        raw = fin.read()

    parser = xml.parsers.expat.ParserCreate("utf-8")
    tracker = ResourceDefinitionLocator(parser)
    parser.StartElementHandler = tracker.start_element
    parser.EndElementHandler = tracker.end_element
    parser.Parse(raw)

    # Split on the byte string, which only breaks at \n, \r and \r\n like the
    # parser does; splitting decoded text would also break at characters such
    # as U+2028 and shift every following line number. Each line is then
    # decoded as UTF-8 so that column numbers index characters, not bytes.
    input_lines = [line.decode("utf-8") for line in raw.splitlines(True)]

    # Every definition except the last one for a key is a duplicate to drop.
    duplicates = []
    for entries in tracker.resource_definitions.values():
        duplicates.extend(entries[:-1])
    if not duplicates:
        # Nothing to rewrite (this also covers an empty file).
        return

    # Process the duplicates in document order so a single pass suffices.
    duplicates.sort(key=lambda entry: entry.start)

    output_lines = []
    pending = ""  # Text kept so far from the line that is being edited.
    line_no = 0   # Next input position that has not been emitted yet.
    col_no = 0
    for definition in duplicates:
        start_line, start_col = definition.start
        print("{0}:{1}:{2}: removing duplicate resource '{3}'".format(
                xml_path, start_line + 1, start_col, definition.name))

        # For a self-closing element the recorded end falls inside the tag,
        # so take the end from the tag text itself in that case.
        end_line, end_col = (_empty_element_end(input_lines, start_line, start_col)
                or definition.end)

        if line_no < start_line:
            # This definition starts on a later line: flush the line being
            # edited, unless only whitespace is left of it.
            remainder = pending + input_lines[line_no][col_no:]
            if not remainder.isspace():
                output_lines.append(remainder)
            pending = ""
            col_no = 0
            line_no += 1

        # Copy the untouched lines before this definition.
        output_lines.extend(input_lines[line_no:start_line])

        # Keep the text in front of the definition and skip to its end.
        pending += input_lines[start_line][col_no:start_col]
        line_no, col_no = end_line, end_col

    # Flush what follows the last duplicate, then the rest of the file.
    remainder = pending + input_lines[line_no][col_no:]
    if not remainder.isspace():
        output_lines.append(remainder)
    output_lines.extend(input_lines[line_no + 1:])

    print("{0}: writing deduped copy...".format(xml_path))

    # Write to a temporary file next to the original, then rename it over the
    # original so the replacement happens in one step.
    dirname, basename = os.path.split(xml_path)
    temp = tempfile.NamedTemporaryFile(prefix=basename, dir=dirname, delete=False)
    try:
        with temp:
            temp.write("".join(output_lines).encode("utf-8"))
        # Temporary files are created accessible to their owner only; carry
        # over the original's permission bits so the rename does not change them.
        os.chmod(temp.name, os.stat(xml_path).st_mode & 0o7777)
        os.rename(temp.name, xml_path)
    except Exception:
        # Do not leave a stray temporary file in the resource directory.
        os.remove(temp.name)
        raise

def enumerate_files(res_path):
    """Enumerates all files in the resource directory that are XML files and
       directly within a values* subdirectory. These types of files end up
       compiled in the resources.arsc table of an APK.

       Entries of res_path whose name starts with 'values' but which are not
       directories are ignored, as are '.xml' entries that are not files.

       Returns a list of paths (each one res_path joined with the directory
       and file name), sorted by directory name and then by file name.
       Raises OSError if res_path itself cannot be listed.
    """
    all_files = []
    for dir_name in sorted(os.listdir(res_path)):
        values_dir = os.path.join(res_path, dir_name)
        # A plain file named e.g. 'values.bak' must not be listed as a directory.
        if not dir_name.startswith('values') or not os.path.isdir(values_dir):
            continue
        for file_name in sorted(os.listdir(values_dir)):
            xml_path = os.path.join(values_dir, file_name)
            if file_name.endswith('.xml') and os.path.isfile(xml_path):
                all_files.append(xml_path)
    return all_files

if __name__ == '__main__':
    # Script entry point: the first command-line argument is the resource
    # directory whose values* subdirectories are scanned and deduplicated.
    cli_args = sys.argv[1:]
    if not cli_args:
        sys.stderr.write("please specify a path to a resource directory\n")
        sys.exit(1)

    res_path = os.path.abspath(cli_args[0])
    print("looking in {0} ...".format(res_path))

    # Each XML file is rewritten in place when it contains duplicates.
    for xml_file in enumerate_files(res_path):
        print("checking {0} ...".format(xml_file))
        remove_duplicates(xml_file)