@@ -860,6 +860,75 @@ void __init dump_numa_cpu_topology(void)
}
}
+/*
+ * The scheduler needs every unique node distance at boot and derives them
+ * from the distances it can query. On POWER, distances for offline nodes
+ * are not available, but the set of possible unique distances is known.
+ * Fake the offline nodes' distance_lookup_table entries so that every
+ * possible node distance is represented.
+ */
+void __init fake_update_distance_lookup_table(void)
+{
+	unsigned long distance_map;
+	int i, nr_levels, nr_depth, node;
+
+	if (!numa_enabled || !form1_affinity)
+		return;
+
+	/*
+	 * distance_ref_points_depth lists the unique numa domains available
+	 * but ignores LOCAL_DISTANCE; add 1 for the number of unique distances.
+	 */
+	nr_depth = distance_ref_points_depth + 1;
+
+	/* distance_map is one word: it can track at most BITS_PER_LONG levels. */
+	if (WARN_ON(nr_depth > BITS_PER_LONG))
+		return;
+
+	bitmap_zero(&distance_map, nr_depth);
+	bitmap_set(&distance_map, 0, 1);	/* bit 0: LOCAL_DISTANCE */
+
+	for_each_online_node(node) {
+		int nd, distance = LOCAL_DISTANCE;
+
+		if (node == first_online_node)
+			continue;
+
+		nd = __node_distance(node, first_online_node);
+		for (i = 0; i < nr_depth; i++, distance *= 2) {
+			if (distance == nd) {
+				bitmap_set(&distance_map, i, 1);
+				break;
+			}
+		}
+	}
+
+	/* Nothing to fake if the online nodes already cover every level. */
+	nr_levels = bitmap_weight(&distance_map, nr_depth);
+	if (nr_levels == nr_depth)
+		return;
+
+	for_each_node(node) {
+		if (node_online(node))
+			continue;
+
+		i = find_first_zero_bit(&distance_map, nr_depth);
+		if (i >= nr_depth || i == 0) {
+			pr_warn("Levels(%d) not matching levels(%d)\n", nr_levels, nr_depth);
+			return;
+		}
+
+		bitmap_set(&distance_map, i, 1);
+		/* Levels below i get this node's own id (presumably unique - confirm). */
+		while (i--)
+			distance_lookup_table[node][i] = node;
+
+		nr_levels = bitmap_weight(&distance_map, nr_depth);
+		if (nr_levels == nr_depth)
+			return;
+	}
+}
+
/* Initialize NODE_DATA for a node on the local memory */
static void __init setup_node_data(int nid, u64 start_pfn, u64 end_pfn)
{
@@ -975,6 +1044,7 @@ void __init mem_topology_setup(void)
*/
numa_setup_cpu(cpu);
}
+ fake_update_distance_lookup_table();
}
void __init initmem_init(void)
Currently the scheduler populates the distance map by looking at the distance of each node from all other nodes. This should work for most architectures and platforms.

The scheduler expects the unique node distances to be available at boot. It uses node distances to calculate the number of unique node distances. On Power Servers, node distances for offline nodes are not available. However, Power Servers already know the unique possible node distances. Fake the offline nodes' distance_lookup_table entries so that all possible node distances are updated.

For example, distance info from numactl on a fully populated 8-node system at boot may look like this:

node distances:
node   0   1   2   3   4   5   6   7
  0:  10  20  40  40  40  40  40  40
  1:  20  10  40  40  40  40  40  40
  2:  40  40  10  20  40  40  40  40
  3:  40  40  20  10  40  40  40  40
  4:  40  40  40  40  10  20  40  40
  5:  40  40  40  40  20  10  40  40
  6:  40  40  40  40  40  40  10  20
  7:  40  40  40  40  40  40  20  10

However, when the same system has only two nodes online at boot, the distance info from numactl will look like this:

node distances:
node   0   1
  0:  10  20
  1:  20  10

What node_distance(0,3) returns when node 0 is online and node 3 is offline may be implementation dependent. In the Power Servers case, it returns LOCAL_DISTANCE(10). Here at boot the scheduler would assume that the max distance between nodes is 20. However, that would not be true: when nodes are onlined and CPUs from those nodes are hotplugged, the max node distance would be 40.

However, this only needs to be done if the number of unique node distances that can be computed for online nodes is less than the number of possible unique node distances as represented by distance_ref_points_depth. When the node is actually onlined, distance_lookup_table will be updated with actual entries.
Cc: LKML <linux-kernel@vger.kernel.org> Cc: linuxppc-dev@lists.ozlabs.org Cc: Nathan Lynch <nathanl@linux.ibm.com> Cc: Michael Ellerman <mpe@ellerman.id.au> Cc: Ingo Molnar <mingo@kernel.org> Cc: Peter Zijlstra <peterz@infradead.org> Cc: Valentin Schneider <valentin.schneider@arm.com> Cc: Gautham R Shenoy <ego@linux.vnet.ibm.com> Cc: Dietmar Eggemann <dietmar.eggemann@arm.com> Cc: Mel Gorman <mgorman@techsingularity.net> Cc: Vincent Guittot <vincent.guittot@linaro.org> Cc: Rik van Riel <riel@surriel.com> Cc: Geetika Moolchandani <Geetika.Moolchandani1@ibm.com> Cc: Laurent Dufour <ldufour@linux.ibm.com> Reported-by: Geetika Moolchandani <Geetika.Moolchandani1@ibm.com> Signed-off-by: Srikar Dronamraju <srikar@linux.vnet.ibm.com> --- Changelog v1->v2: Move to a Powerpc specific solution as suggested by Peter and Valentin arch/powerpc/mm/numa.c | 70 ++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 70 insertions(+)