#!/usr/bin/env python

import os
import os.path
import sys
import tempfile
import xml.parsers.expat

"""
Scans each XML resource file in the res/values*/ directories looking for
duplicate resource definitions. All but the last occurrence of each
definition are removed. This makes no semantic change: the APK built from
the result should contain the same definitions.
"""

class Duplicate:
    """Plain record describing where one resource definition sits in a file.

    Attributes:
        name: resource identifier in the form "type/name".
        product: value of the element's "product" attribute, or "" if absent.
        depth: nesting depth of the defining element (the root element is 1).
        start: (zero-based line, column) where the definition begins.
        end: (zero-based line, column) just past the definition, or None
            while the element is still open.
    """
    def __init__(self, name, product, depth, start, end):
        # Identity of the resource.
        self.name, self.product = name, product
        # Where it was found.
        self.depth = depth
        self.start, self.end = start, end

class ResourceDefinitionLocator:
    """Callback class for xml.parsers.expat which records resource definitions and their
    locations.

    Register start_element/end_element as the parser's StartElementHandler and
    EndElementHandler. After parsing, resource_definitions maps
    "type/name:product" to the list of Duplicate records found for that
    resource, in document order. Only elements at depth 2 (direct children of
    the root element) are recorded.
    """

    # Depth-2 tags that are never treated as resource definitions.
    _IGNORED_TAGS = frozenset(["public", "java-symbol", "eat-comment", "skip"])

    def __init__(self, parser):
        self.resource_definitions = {}
        self._parser = parser
        self._depth = 0
        self._current_resource = None
        # True when the element being recorded was written as <tag ... />.
        self._current_is_empty_tag = False

    @staticmethod
    def _is_empty_element_tag(context):
        """Return True if the tag at the start of `context` is closed by '/>'.

        `context` is the byte string returned by the parser's
        GetInputContext() inside a start-element callback, i.e. it begins at
        the '<' of the tag. Quoted attribute values are skipped so that a '>'
        inside a value is not mistaken for the end of the tag. Returns False
        when no context is available or the tag end is not found.
        """
        if not context:
            return False
        quote = None
        index = 0
        length = len(context)
        # A while loop (rather than iterating the whole buffer) so that the
        # scan stops at the end of the tag; the context can be very long.
        while index < length:
            char = context[index:index + 1]
            if quote is not None:
                if char == quote:
                    quote = None
            elif char == b'"' or char == b"'":
                quote = char
            elif char == b'>':
                return context[index - 1:index] == b'/'
            index += 1
        return False

    def start_element(self, tag_name, attrs):
        """Begin recording a resource definition for a depth-2 element.

        Elements that cannot be identified (no "name" attribute, or an <item>
        without a "type" attribute) are skipped instead of raising KeyError.
        """
        self._depth += 1
        if self._depth != 2 or tag_name in self._IGNORED_TAGS:
            return

        resource_type = attrs.get("type") if tag_name == "item" else tag_name
        resource_name = attrs.get("name")
        if resource_type is None or resource_name is None:
            # Without a type and a name there is no key to compare against
            # other definitions, so the element can never be a duplicate.
            return

        # NOTE: GetInputContext() copies expat's buffer from the current event
        # onwards, so this costs time proportional to the unparsed remainder.
        self._current_is_empty_tag = self._is_empty_element_tag(
                self._parser.GetInputContext())
        self._current_resource = Duplicate(
                "{0}/{1}".format(resource_type, resource_name),
                attrs.get("product", ""),
                self._depth,
                (self._parser.CurrentLineNumber - 1, self._parser.CurrentColumnNumber),
                None)

    def end_element(self, tag_name):
        """Finish the definition being recorded, if this closes it, and store it."""
        current = self._current_resource
        if current is not None and self._depth == current.depth:
            end_line = self._parser.CurrentLineNumber - 1
            end_column = self._parser.CurrentColumnNumber
            if not self._current_is_empty_tag:
                # The parser is positioned at the '<' of the end tag; skip
                # over the tag name plus the "</" and ">" around it.
                # (Assumes the end tag has no whitespace before '>'.)
                end_column += len("</>") + len(tag_name)
            # For an empty-element tag (<tag ... />) expat reports the end
            # event at the position just past "/>", so nothing is added.
            current.end = (end_line, end_column)

            key_name = "{0}:{1}".format(current.name, current.product)
            self.resource_definitions.setdefault(key_name, []).append(current)
            self._current_resource = None
        self._depth -= 1

def remove_duplicates(xml_path):
    """Rewrites xml_path with duplicate resource definitions removed.

    For every resource that is defined more than once, the last definition is
    kept and all earlier ones are cut out of the text. The rest of the file is
    left as it was, except that a line left containing only whitespace after a
    removal is dropped. The result is written to a temporary file in the same
    directory and then renamed over the original. A file without duplicates
    is not touched.

    Args:
        xml_path: path of the UTF-8 encoded resource XML file to process.

    Raises:
        xml.parsers.expat.ExpatError: if the file is not well-formed XML,
            including a document that ends before its root element is closed.
        IOError/OSError: if the file cannot be read or replaced.
    """
    # Read bytes so the same data can be handed to expat and decoded below,
    # whichever Python version runs the script.
    with open(xml_path, 'rb') as fin:
        raw_xml = fin.read()

    parser = xml.parsers.expat.ParserCreate("utf-8")
    # returns_unicode is a Python 2 parser attribute; skip the assignment on
    # parsers that do not expose it.
    if hasattr(parser, "returns_unicode"):
        parser.returns_unicode = True
    tracker = ResourceDefinitionLocator(parser)
    parser.StartElementHandler = tracker.start_element
    parser.EndElementHandler = tracker.end_element
    # Mark the data as final so that a truncated document is reported as an
    # error instead of being silently accepted and rewritten.
    parser.Parse(raw_xml, True)

    # Extract the duplicate resource definitions, ignoring the last definition
    # which will take precedence and be left intact.
    duplicates = []
    for entries in tracker.resource_definitions.values():
        duplicates.extend(entries[:-1])

    if not duplicates:
        # Nothing to remove, so leave the file alone.
        return

    # Sort the duplicates so that they are in order. That way we only do one pass.
    duplicates.sort(key=lambda definition: definition.start)

    # Treat the input as UTF-8 or else column numbers will be wrong.
    input_lines = raw_xml.decode('utf-8').splitlines(True)

    # (last_line_no, last_col_no) is the position in the input up to which
    # text has been consumed; current_line holds the kept part of the output
    # line that is currently being assembled.
    last_line_no = 0
    last_col_no = 0
    output_lines = []
    current_line = ""
    for definition in duplicates:
        print("{0}:{1}:{2}: removing duplicate resource '{3}'".format(
                xml_path, definition.start[0] + 1, definition.start[1], definition.name))

        if last_line_no < definition.start[0]:
            # The next definition is on a new line, so write what we have
            # to the output.
            new_line = current_line + input_lines[last_line_no][last_col_no:]
            if not new_line.isspace():
                output_lines.append(new_line)
            current_line = ""
            last_col_no = 0
            last_line_no += 1

        # Copy all the lines up until this one.
        output_lines.extend(input_lines[last_line_no:definition.start[0]])

        # Add to the existing line we're building, by including the prefix of this line
        # and skipping the lines and characters until the end of this duplicate definition.
        last_line_no = definition.start[0]
        current_line += input_lines[last_line_no][last_col_no:definition.start[1]]
        last_line_no, last_col_no = definition.end

    # Flush the remainder of the line on which the last duplicate ended, then
    # copy everything after it unchanged.
    new_line = current_line + input_lines[last_line_no][last_col_no:]
    if not new_line.isspace():
        output_lines.append(new_line)
    output_lines.extend(input_lines[last_line_no + 1:])

    print("{0}: writing deduped copy...".format(xml_path))

    # Write the lines to a temporary file next to the original.
    dirname, basename = os.path.split(xml_path)
    temp_name = None
    replaced = False
    try:
        with tempfile.NamedTemporaryFile(prefix=basename, dir=dirname, delete=False) as temp:
            temp_name = temp.name
            temp.write("".join(output_lines).encode('utf-8'))

        # The temporary file is created with private permissions; give it the
        # original file's mode so the rename does not change it.
        os.chmod(temp_name, os.stat(xml_path).st_mode & 0o7777)

        # Now rename that file to the original so we have an atomic write that is consistent.
        os.rename(temp_name, xml_path)
        replaced = True
    finally:
        if temp_name is not None and not replaced:
            # Best-effort cleanup so a failed run leaves no stray file behind;
            # the original error (if any) keeps propagating.
            try:
                os.remove(temp_name)
            except OSError:
                pass

def enumerate_files(res_path):
    """Enumerates all files in the resource directory that are XML files and
       within a values-* subdirectory. These types of files end up compiled
       in the resources.arsc table of an APK.

       Only directories whose name starts with 'values' are searched, and
       only regular files ending in '.xml' directly inside them are returned;
       other entries with matching names are ignored.

       Args:
           res_path: path to the resource directory to search.

       Returns:
           A list of paths (res_path joined with the sub-directory and file
           name), sorted by directory name and then by file name.

       Raises:
           OSError: if res_path or a values directory cannot be listed.
    """
    all_files = []
    # Sorted so files are processed in the same order on every run.
    for entry in sorted(os.listdir(res_path)):
        values_dir = os.path.join(res_path, entry)
        # A plain file such as 'values.txt' must not be passed to listdir.
        if not entry.startswith('values') or not os.path.isdir(values_dir):
            continue
        for file_name in sorted(os.listdir(values_dir)):
            xml_path = os.path.join(values_dir, file_name)
            if file_name.endswith('.xml') and os.path.isfile(xml_path):
                all_files.append(xml_path)
    return all_files

if __name__ == '__main__':
    # Usage: <script> RES_DIR -- the first argument is the resource directory
    # whose values*/ XML files are deduplicated in place.
    arguments = sys.argv[1:]
    if not arguments:
        sys.stderr.write("please specify a path to a resource directory\n")
        sys.exit(1)

    res_path = os.path.abspath(arguments[0])
    print("looking in {0} ...".format(res_path))

    for xml_file in enumerate_files(res_path):
        print("checking {0} ...".format(xml_file))
        remove_duplicates(xml_file)