| /* |
| * arch/ia64/kernel/domain.c |
| * Architecture specific sched-domains builder. |
| * |
| * Copyright (C) 2004 Jesse Barnes |
| * Copyright (C) 2004 Silicon Graphics, Inc. |
| */ |
| |
| #include <linux/sched.h> |
| #include <linux/percpu.h> |
| #include <linux/slab.h> |
| #include <linux/cpumask.h> |
| #include <linux/init.h> |
| #include <linux/topology.h> |
| #include <linux/nodemask.h> |
| |
| #define SD_NODES_PER_DOMAIN 16 |
| |
| #ifdef CONFIG_NUMA |
| /** |
| * find_next_best_node - find the next node to include in a sched_domain |
| * @node: node whose sched_domain we're building |
| * @used_nodes: nodes already in the sched_domain |
| * |
| * Find the next node to include in a given scheduling domain. Simply |
| * finds the closest node not already in the @used_nodes map. |
| * |
| * Should use nodemask_t. |
| */ |
| static int __devinit find_next_best_node(int node, unsigned long *used_nodes) |
| { |
| int i, n, val, min_val, best_node = 0; |
| |
| min_val = INT_MAX; |
| |
| for (i = 0; i < MAX_NUMNODES; i++) { |
| /* Start at @node */ |
| n = (node + i) % MAX_NUMNODES; |
| |
| if (!nr_cpus_node(n)) |
| continue; |
| |
| /* Skip already used nodes */ |
| if (test_bit(n, used_nodes)) |
| continue; |
| |
| /* Simple min distance search */ |
| val = node_distance(node, n); |
| |
| if (val < min_val) { |
| min_val = val; |
| best_node = n; |
| } |
| } |
| |
| set_bit(best_node, used_nodes); |
| return best_node; |
| } |
| |
| /** |
| * sched_domain_node_span - get a cpumask for a node's sched_domain |
| * @node: node whose cpumask we're constructing |
| * @size: number of nodes to include in this span |
| * |
| * Given a node, construct a good cpumask for its sched_domain to span. It |
| * should be one that prevents unnecessary balancing, but also spreads tasks |
| * out optimally. |
| */ |
| static cpumask_t __devinit sched_domain_node_span(int node) |
| { |
| int i; |
| cpumask_t span, nodemask; |
| DECLARE_BITMAP(used_nodes, MAX_NUMNODES); |
| |
| cpus_clear(span); |
| bitmap_zero(used_nodes, MAX_NUMNODES); |
| |
| nodemask = node_to_cpumask(node); |
| cpus_or(span, span, nodemask); |
| set_bit(node, used_nodes); |
| |
| for (i = 1; i < SD_NODES_PER_DOMAIN; i++) { |
| int next_node = find_next_best_node(node, used_nodes); |
| nodemask = node_to_cpumask(next_node); |
| cpus_or(span, span, nodemask); |
| } |
| |
| return span; |
| } |
| #endif |
| |
/*
 * CONFIG_SCHED_SMT is never defined at the moment; the code is kept so
 * that it can be switched on easily if it is ever needed.
 */
#ifdef CONFIG_SCHED_SMT
/* Per-cpu sibling-level domain and the groups it points into */
static DEFINE_PER_CPU(struct sched_domain, cpu_domains);
static struct sched_group sched_group_cpus[NR_CPUS];

/* Each CPU is its own group at the sibling level. */
static int __devinit cpu_to_cpu_group(int cpu)
{
	return cpu;
}
#endif
| |
| static DEFINE_PER_CPU(struct sched_domain, phys_domains); |
| static struct sched_group sched_group_phys[NR_CPUS]; |
| static int __devinit cpu_to_phys_group(int cpu) |
| { |
| #ifdef CONFIG_SCHED_SMT |
| return first_cpu(cpu_sibling_map[cpu]); |
| #else |
| return cpu; |
| #endif |
| } |
| |
#ifdef CONFIG_NUMA
/*
 * init_sched_build_groups() cannot express what we want for node groups,
 * so they are built by hand: every node owns its own list of groups, and
 * that list is allocated dynamically.
 */
static DEFINE_PER_CPU(struct sched_domain, node_domains);
static struct sched_group *sched_group_nodes[MAX_NUMNODES];

/* Top-level domain covering all nodes, with one group slot per node */
static DEFINE_PER_CPU(struct sched_domain, allnodes_domains);
static struct sched_group sched_group_allnodes[MAX_NUMNODES];

/* A CPU's group in the all-nodes domain is indexed by its node number. */
static int __devinit cpu_to_allnodes_group(int cpu)
{
	return cpu_to_node(cpu);
}
#endif
| |
| /* |
| * Set up scheduler domains and groups. Callers must hold the hotplug lock. |
| */ |
| void __devinit arch_init_sched_domains(void) |
| { |
| int i; |
| cpumask_t cpu_default_map; |
| |
| /* |
| * Setup mask for cpus without special case scheduling requirements. |
| * For now this just excludes isolated cpus, but could be used to |
| * exclude other special cases in the future. |
| */ |
| cpus_complement(cpu_default_map, cpu_isolated_map); |
| cpus_and(cpu_default_map, cpu_default_map, cpu_online_map); |
| |
| /* |
| * Set up domains. Isolated domains just stay on the dummy domain. |
| */ |
| for_each_cpu_mask(i, cpu_default_map) { |
| int group; |
| struct sched_domain *sd = NULL, *p; |
| cpumask_t nodemask = node_to_cpumask(cpu_to_node(i)); |
| |
| cpus_and(nodemask, nodemask, cpu_default_map); |
| |
| #ifdef CONFIG_NUMA |
| if (num_online_cpus() |
| > SD_NODES_PER_DOMAIN*cpus_weight(nodemask)) { |
| sd = &per_cpu(allnodes_domains, i); |
| *sd = SD_ALLNODES_INIT; |
| sd->span = cpu_default_map; |
| group = cpu_to_allnodes_group(i); |
| sd->groups = &sched_group_allnodes[group]; |
| p = sd; |
| } else |
| p = NULL; |
| |
| sd = &per_cpu(node_domains, i); |
| *sd = SD_NODE_INIT; |
| sd->span = sched_domain_node_span(cpu_to_node(i)); |
| sd->parent = p; |
| cpus_and(sd->span, sd->span, cpu_default_map); |
| #endif |
| |
| p = sd; |
| sd = &per_cpu(phys_domains, i); |
| group = cpu_to_phys_group(i); |
| *sd = SD_CPU_INIT; |
| sd->span = nodemask; |
| sd->parent = p; |
| sd->groups = &sched_group_phys[group]; |
| |
| #ifdef CONFIG_SCHED_SMT |
| p = sd; |
| sd = &per_cpu(cpu_domains, i); |
| group = cpu_to_cpu_group(i); |
| *sd = SD_SIBLING_INIT; |
| sd->span = cpu_sibling_map[i]; |
| cpus_and(sd->span, sd->span, cpu_default_map); |
| sd->parent = p; |
| sd->groups = &sched_group_cpus[group]; |
| #endif |
| } |
| |
| #ifdef CONFIG_SCHED_SMT |
| /* Set up CPU (sibling) groups */ |
| for_each_cpu_mask(i, cpu_default_map) { |
| cpumask_t this_sibling_map = cpu_sibling_map[i]; |
| cpus_and(this_sibling_map, this_sibling_map, cpu_default_map); |
| if (i != first_cpu(this_sibling_map)) |
| continue; |
| |
| init_sched_build_groups(sched_group_cpus, this_sibling_map, |
| &cpu_to_cpu_group); |
| } |
| #endif |
| |
| /* Set up physical groups */ |
| for (i = 0; i < MAX_NUMNODES; i++) { |
| cpumask_t nodemask = node_to_cpumask(i); |
| |
| cpus_and(nodemask, nodemask, cpu_default_map); |
| if (cpus_empty(nodemask)) |
| continue; |
| |
| init_sched_build_groups(sched_group_phys, nodemask, |
| &cpu_to_phys_group); |
| } |
| |
| #ifdef CONFIG_NUMA |
| init_sched_build_groups(sched_group_allnodes, cpu_default_map, |
| &cpu_to_allnodes_group); |
| |
| for (i = 0; i < MAX_NUMNODES; i++) { |
| /* Set up node groups */ |
| struct sched_group *sg, *prev; |
| cpumask_t nodemask = node_to_cpumask(i); |
| cpumask_t domainspan; |
| cpumask_t covered = CPU_MASK_NONE; |
| int j; |
| |
| cpus_and(nodemask, nodemask, cpu_default_map); |
| if (cpus_empty(nodemask)) |
| continue; |
| |
| domainspan = sched_domain_node_span(i); |
| cpus_and(domainspan, domainspan, cpu_default_map); |
| |
| sg = kmalloc(sizeof(struct sched_group), GFP_KERNEL); |
| sched_group_nodes[i] = sg; |
| for_each_cpu_mask(j, nodemask) { |
| struct sched_domain *sd; |
| sd = &per_cpu(node_domains, j); |
| sd->groups = sg; |
| if (sd->groups == NULL) { |
| /* Turn off balancing if we have no groups */ |
| sd->flags = 0; |
| } |
| } |
| if (!sg) { |
| printk(KERN_WARNING |
| "Can not alloc domain group for node %d\n", i); |
| continue; |
| } |
| sg->cpu_power = 0; |
| sg->cpumask = nodemask; |
| cpus_or(covered, covered, nodemask); |
| prev = sg; |
| |
| for (j = 0; j < MAX_NUMNODES; j++) { |
| cpumask_t tmp, notcovered; |
| int n = (i + j) % MAX_NUMNODES; |
| |
| cpus_complement(notcovered, covered); |
| cpus_and(tmp, notcovered, cpu_default_map); |
| cpus_and(tmp, tmp, domainspan); |
| if (cpus_empty(tmp)) |
| break; |
| |
| nodemask = node_to_cpumask(n); |
| cpus_and(tmp, tmp, nodemask); |
| if (cpus_empty(tmp)) |
| continue; |
| |
| sg = kmalloc(sizeof(struct sched_group), GFP_KERNEL); |
| if (!sg) { |
| printk(KERN_WARNING |
| "Can not alloc domain group for node %d\n", j); |
| break; |
| } |
| sg->cpu_power = 0; |
| sg->cpumask = tmp; |
| cpus_or(covered, covered, tmp); |
| prev->next = sg; |
| prev = sg; |
| } |
| prev->next = sched_group_nodes[i]; |
| } |
| #endif |
| |
| /* Calculate CPU power for physical packages and nodes */ |
| for_each_cpu_mask(i, cpu_default_map) { |
| int power; |
| struct sched_domain *sd; |
| #ifdef CONFIG_SCHED_SMT |
| sd = &per_cpu(cpu_domains, i); |
| power = SCHED_LOAD_SCALE; |
| sd->groups->cpu_power = power; |
| #endif |
| |
| sd = &per_cpu(phys_domains, i); |
| power = SCHED_LOAD_SCALE + SCHED_LOAD_SCALE * |
| (cpus_weight(sd->groups->cpumask)-1) / 10; |
| sd->groups->cpu_power = power; |
| |
| #ifdef CONFIG_NUMA |
| sd = &per_cpu(allnodes_domains, i); |
| if (sd->groups) { |
| power = SCHED_LOAD_SCALE + SCHED_LOAD_SCALE * |
| (cpus_weight(sd->groups->cpumask)-1) / 10; |
| sd->groups->cpu_power = power; |
| } |
| #endif |
| } |
| |
| #ifdef CONFIG_NUMA |
| for (i = 0; i < MAX_NUMNODES; i++) { |
| struct sched_group *sg = sched_group_nodes[i]; |
| int j; |
| |
| if (sg == NULL) |
| continue; |
| next_sg: |
| for_each_cpu_mask(j, sg->cpumask) { |
| struct sched_domain *sd; |
| int power; |
| |
| sd = &per_cpu(phys_domains, j); |
| if (j != first_cpu(sd->groups->cpumask)) { |
| /* |
| * Only add "power" once for each |
| * physical package. |
| */ |
| continue; |
| } |
| power = SCHED_LOAD_SCALE + SCHED_LOAD_SCALE * |
| (cpus_weight(sd->groups->cpumask)-1) / 10; |
| |
| sg->cpu_power += power; |
| } |
| sg = sg->next; |
| if (sg != sched_group_nodes[i]) |
| goto next_sg; |
| } |
| #endif |
| |
| /* Attach the domains */ |
| for_each_online_cpu(i) { |
| struct sched_domain *sd; |
| #ifdef CONFIG_SCHED_SMT |
| sd = &per_cpu(cpu_domains, i); |
| #else |
| sd = &per_cpu(phys_domains, i); |
| #endif |
| cpu_attach_domain(sd, i); |
| } |
| } |
| |
| void __devinit arch_destroy_sched_domains(void) |
| { |
| #ifdef CONFIG_NUMA |
| int i; |
| for (i = 0; i < MAX_NUMNODES; i++) { |
| struct sched_group *oldsg, *sg = sched_group_nodes[i]; |
| if (sg == NULL) |
| continue; |
| sg = sg->next; |
| next_sg: |
| oldsg = sg; |
| sg = sg->next; |
| kfree(oldsg); |
| if (oldsg != sched_group_nodes[i]) |
| goto next_sg; |
| sched_group_nodes[i] = NULL; |
| } |
| #endif |
| } |
| |