#!/usr/bin/env python
"""
Scans each resource file in res/values/ looking for duplicates.
All but the last occurrence of resource definition are removed.
This creates no semantic changes, the resulting APK when built
should contain the same definition.

The script runs unchanged on Python 2 and Python 3.
"""

import os
import os.path
import re
import sys
import tempfile
import xml.parsers.expat

# Tags directly below the root element that never start a resource definition.
_NON_RESOURCE_TAGS = frozenset(["public", "java-symbol", "eat-comment", "skip"])

# Splits text into lines, keeping the terminators. Only "\n", "\r\n" and "\r"
# end a line here, because those are the only sequences that advance expat's
# line counter. str.splitlines() additionally breaks on characters such as
# U+2028 or U+0085, which would shift every later line index.
_LINE_RE = re.compile(r"[^\r\n]*(?:\r\n|\r|\n)|[^\r\n]+")


class Duplicate(object):
    """A small struct to maintain the positions of a Duplicate resource definition.

    Attributes:
        name: "<type>/<name>" of the resource.
        product: value of the "product" attribute, or "" when absent.
        depth: element depth at which the definition was opened.
        start: (line, column) of the opening "<", both zero-based.
        end: (line, column) just past the definition, or None until its end
            has been seen.
    """

    def __init__(self, name, product, depth, start, end):
        self.name = name
        self.product = product
        self.depth = depth
        self.start = start
        self.end = end

    def __repr__(self):
        return "Duplicate({0!r}, {1!r}, {2!r}, {3!r}, {4!r})".format(
            self.name, self.product, self.depth, self.start, self.end)


def _split_lines(text):
    """Return the lines of `text`, terminators included, split the way expat counts them."""
    return _LINE_RE.findall(text)


def _find_tag_close(lines, line_no, col_no):
    """Return the (line, column) of the ">" closing the tag that starts at the given position.

    A ">" inside a quoted attribute value is skipped. Raises ValueError if the
    text ends before the tag is closed.
    """
    quote = None
    while line_no < len(lines):
        line = lines[line_no]
        for idx in range(col_no, len(line)):
            ch = line[idx]
            if quote is not None:
                if ch == quote:
                    quote = None
            elif ch in "\"'":
                quote = ch
            elif ch == ">":
                return (line_no, idx)
        line_no += 1
        col_no = 0
    raise ValueError("unterminated tag in source text")


def _element_end(lines, start, end_event):
    """Return the (line, column) just past the element opened at `start`.

    `end_event` is the position reported by the parser while handling the end
    of the element; it is only used when the element has a separate closing
    tag. An empty-element tag ("<item ... />") has none, so its end is just
    past the ">" of the opening tag itself.
    """
    close_line, close_col = _find_tag_close(lines, start[0], start[1])
    if close_col > 0 and lines[close_line][close_col - 1] == "/":
        return (close_line, close_col + 1)
    # Scanning for the real ">" also copes with whitespace such as "</string >".
    close_line, close_col = _find_tag_close(lines, end_event[0], end_event[1])
    return (close_line, close_col + 1)


class ResourceDefinitionLocator(object):
    """Callback class for xml.parsers.expat which records resource definitions and their
    locations.

    After parsing, `resource_definitions` maps "<type>/<name>:<product>" to the
    list of Duplicate records for that key, in document order.

    Args:
        parser: the expat parser whose handlers are bound to this object; it is
            queried for the current line and column inside the callbacks.
        source_lines: optional list of the decoded lines of the document being
            parsed (terminators included). When given, element ends are located
            by scanning this text, which is exact for empty-element tags and
            for closing tags containing whitespace. When omitted, the end is
            estimated as the closing-tag position plus len("</" + tag + ">").
    """

    def __init__(self, parser, source_lines=None):
        self.resource_definitions = {}
        self._parser = parser
        self._source_lines = source_lines
        self._depth = 0
        self._current_resource = None

    def _position(self):
        """Return the parser's current (line, column), both zero-based."""
        return (self._parser.CurrentLineNumber - 1, self._parser.CurrentColumnNumber)

    def start_element(self, tag_name, attrs):
        """Start-tag handler: begin tracking a definition for children of the root.

        Raises KeyError if the element lacks a "name" attribute, or if an
        <item> lacks a "type" attribute.
        """
        self._depth += 1
        if self._depth != 2 or tag_name in _NON_RESOURCE_TAGS:
            return

        product = attrs.get("product", "")
        # <item type="id" name="x"/> names its resource type in an attribute;
        # every other tag is its own type.
        resource_type = attrs["type"] if tag_name == "item" else tag_name
        resource_name = "{0}/{1}".format(resource_type, attrs["name"])
        self._current_resource = Duplicate(
            resource_name, product, self._depth, self._position(), None)

    def end_element(self, tag_name):
        """End-tag handler: record where the tracked definition ends and file it."""
        current = self._current_resource
        if current is not None and self._depth == current.depth:
            event_line, event_col = self._position()
            if self._source_lines is None:
                # Record the end position of the element, which is the length of the name
                # plus the </> symbols (len("</>") == 3).
                current.end = (event_line, event_col + 3 + len(tag_name))
            else:
                current.end = _element_end(
                    self._source_lines, current.start, (event_line, event_col))
            key_name = "{0}:{1}".format(current.name, current.product)
            self.resource_definitions.setdefault(key_name, []).append(current)
            self._current_resource = None
        self._depth -= 1


def _strip_spans(lines, spans):
    """Return `lines` with the text covered by each span removed.

    `spans` must be sorted by start and must not overlap; each needs `start`
    and `end` (line, column) attributes. A line that something was cut out of
    is dropped entirely if only whitespace remains; untouched lines are copied
    verbatim.
    """
    output = []
    pending = None  # Text kept so far from a line that has had something cut out.
    line_no = 0     # Position of the next character not yet copied or skipped.
    col_no = 0

    def emit(text):
        if not text.isspace():
            output.append(text)

    for span in spans:
        start_line, start_col = span.start
        if pending is not None and line_no < start_line:
            # The next span is on a later line, so finish the line being built.
            emit(pending + lines[line_no][col_no:])
            pending = None
            line_no += 1
            col_no = 0

        # Copy all the untouched lines up until the one the span starts on.
        output.extend(lines[line_no:start_line])

        # Keep the text before the span, then skip ahead to the span's end.
        pending = (pending or "") + lines[start_line][col_no:start_col]
        line_no, col_no = span.end

    if pending is not None:
        emit(pending + lines[line_no][col_no:])
        line_no += 1
    output.extend(lines[line_no:])
    return output


def _replace_file(path, data):
    """Atomically replace `path` with the bytes `data`, keeping its permission bits."""
    dirname, basename = os.path.split(path)
    # The temporary lives next to the target so the final rename stays on one
    # filesystem.
    temp = tempfile.NamedTemporaryFile(
        prefix=basename, dir=dirname or os.curdir, delete=False)
    try:
        with temp:
            temp.write(data)
        # The temporary is created private to the owner; copy the original's
        # permission bits so the rewritten file stays as accessible as before.
        os.chmod(temp.name, os.stat(path).st_mode & 0o7777)
        # Now rename that file to the original so we have an atomic write that is consistent.
        os.rename(temp.name, path)
    except Exception:
        # Do not leave a stray temporary next to the resource file.
        if os.path.exists(temp.name):
            os.unlink(temp.name)
        raise


def remove_duplicates(xml_path):
    """Reads the input file and generates an output file with any duplicate
    resources removed, keeping the last occurring definition and removing
    the others. The output is written to a temporary and then renamed
    to the original file name.

    The file is left untouched when it contains no duplicates. Two definitions
    are duplicates when they share type, name and "product" attribute.

    Raises:
        xml.parsers.expat.ExpatError: if the file is not well-formed XML,
            including a document that ends prematurely.
        UnicodeDecodeError: if the file is not valid UTF-8.
        KeyError: if a child of the root lacks a required attribute.
    """
    with open(xml_path, "rb") as fin:
        raw = fin.read()

    # Treat the input as UTF-8 or else column numbers will be wrong.
    input_lines = _split_lines(raw.decode("utf-8"))

    parser = xml.parsers.expat.ParserCreate("utf-8")
    if hasattr(parser, "returns_unicode"):
        # Only Python 2 parsers have this switch.
        parser.returns_unicode = True
    tracker = ResourceDefinitionLocator(parser, input_lines)
    parser.StartElementHandler = tracker.start_element
    parser.EndElementHandler = tracker.end_element
    # The whole document is supplied at once, so mark it as the final chunk.
    parser.Parse(raw, True)

    # Extract the duplicate resource definitions, ignoring the last definition
    # which will take precedence and be left intact.
    duplicates = []
    for entries in tracker.resource_definitions.values():
        duplicates.extend(entries[:-1])
    if not duplicates:
        return

    # Sort the duplicates so that they are in order. That way we only do one pass.
    duplicates.sort(key=lambda definition: definition.start)

    for definition in duplicates:
        print("{0}:{1}:{2}: removing duplicate resource '{3}'".format(
            xml_path, definition.start[0] + 1, definition.start[1], definition.name))

    output_lines = _strip_spans(input_lines, duplicates)

    print("{0}: writing deduped copy...".format(xml_path))
    _replace_file(xml_path, "".join(output_lines).encode("utf-8"))


def enumerate_files(res_path):
    """Enumerates all files in the resource directory that are XML files and
    within a values-* subdirectory. These types of files end up compiled
    in the resources.arsc table of an APK.

    Returns a list of paths (each joined onto `res_path`), sorted by directory
    and then file name. Entries whose name starts with "values" but which are
    not directories are ignored.
    """
    all_files = []
    for entry in sorted(os.listdir(res_path)):
        values_dir = os.path.join(res_path, entry)
        if not entry.startswith("values") or not os.path.isdir(values_dir):
            continue
        all_files.extend(
            os.path.join(values_dir, file_name)
            for file_name in sorted(os.listdir(values_dir))
            if file_name.endswith(".xml"))
    return all_files


def _main(argv):
    """Command-line entry point; returns the process exit status."""
    if len(argv) < 2:
        sys.stderr.write("please specify a path to a resource directory\n")
        return 1

    res_path = os.path.abspath(argv[1])
    print("looking in {0} ...".format(res_path))

    for xml_file in enumerate_files(res_path):
        print("checking {0} ...".format(xml_file))
        remove_duplicates(xml_file)
    return 0


if __name__ == '__main__':
    sys.exit(_main(sys.argv))