========================================================================================================================
<snip>
length=128: 12.10 ( -1.76%) 13.19 ( 7.06%) 12.32
length=144: 11.32 ( -0.45%) 14.07 ( 23.71%) 11.38
length=129: 12.33 ( -0.71%) 13.10 ( 5.54%) 12.41
length=143: 11.22 ( -0.77%) 14.00 ( 23.80%) 11.31
length=130: 12.16 ( -0.77%) 13.54 ( 10.55%) 12.25
length=142: 11.38 ( -0.32%) 14.07 ( 23.24%) 11.42
length=131: 12.07 ( -0.78%) 13.47 ( 10.76%) 12.16
length=141: 11.32 ( -0.21%) 13.99 ( 23.37%) 11.34
length=132: 12.06 ( -0.32%) 13.58 ( 12.24%) 12.10
length=140: 11.53 ( 0.32%) 13.96 ( 21.45%) 11.49
length=133: 12.00 ( 0.17%) 13.57 ( 13.31%) 11.98
length=139: 11.49 ( 0.17%) 13.87 ( 20.84%) 11.47
length=134: 12.02 ( 0.14%) 13.64 ( 13.68%) 12.00
length=138: 11.56 ( -0.51%) 13.65 ( 17.48%) 11.62
length=135: 11.62 ( -1.66%) 13.47 ( 13.99%) 11.82
length=137: 11.50 ( -0.59%) 13.60 ( 17.59%) 11.57
length=256: 19.90 ( -1.66%) 19.90 ( -1.66%) 20.24
length=272: 17.55 ( -2.13%) 19.39 ( 8.13%) 17.93
length=257: 19.94 ( -1.94%) 20.01 ( -1.59%) 20.33
length=271: 17.73 ( -2.10%) 19.33 ( 6.75%) 18.11
length=258: 19.49 ( -1.88%) 19.71 ( -0.74%) 19.86
length=270: 17.94 ( -1.82%) 19.38 ( 6.02%) 18.28
length=259: 19.30 ( -2.11%) 19.68 ( -0.20%) 19.72
length=269: 17.88 ( -2.19%) 19.15 ( 4.79%) 18.28
length=260: 19.28 ( -1.88%) 19.60 ( -0.25%) 19.65
length=268: 17.91 ( -3.53%) 19.17 ( 3.24%) 18.57
length=261: 18.98 ( -1.78%) 19.31 ( -0.08%) 19.33
length=267: 17.96 ( -1.79%) 19.12 ( 4.56%) 18.29
length=262: 18.63 ( -1.99%) 19.30 ( 1.52%) 19.01
length=266: 18.20 ( -1.33%) 19.22 ( 4.18%) 18.45
length=263: 18.46 ( -1.86%) 19.20 ( 2.07%) 18.81
length=265: 18.16 ( -1.87%) 19.22 ( 3.87%) 18.51
length=512: 22.49 ( -0.98%) 23.52 ( 3.56%) 22.71
length=528: 20.71 ( -0.21%) 22.93 ( 10.53%) 20.75
length=513: 22.31 ( -0.77%) 23.37 ( 3.97%) 22.48
length=527: 20.59 ( -0.49%) 22.60 ( 9.24%) 20.69
length=514: 22.21 ( -1.31%) 23.33 ( 3.65%) 22.51
length=526: 20.93 ( -0.48%) 22.80 ( 8.42%) 21.03
length=515: 21.86 ( -0.60%) 22.96 ( 4.39%) 22.00
length=525: 20.61 ( 0.02%) 22.73 ( 10.34%) 20.60
length=516: 21.85 ( -2.07%) 22.91 ( 2.67%) 22.31
length=524: 21.04 ( -0.65%) 22.74 ( 7.35%) 21.18
length=517: 21.33 ( -1.44%) 22.90 ( 5.83%) 21.64
length=523: 20.76 ( -0.54%) 22.78 ( 9.11%) 20.88
length=518: 21.65 ( -0.55%) 23.01 ( 5.70%) 21.77
length=522: 21.22 ( -0.46%) 22.87 ( 7.29%) 21.32
length=519: 21.20 ( -0.82%) 22.78 ( 6.58%) 21.38
length=521: 20.99 ( -0.89%) 22.72 ( 7.26%) 21.18
length=1024: 23.42 ( -1.62%) 22.16 ( -6.92%) 23.80
length=1040: 22.53 ( 0.53%) 21.57 ( -3.76%) 22.41
length=1025: 23.02 ( -0.41%) 21.18 ( -8.37%) 23.11
length=1039: 21.85 ( 1.18%) 21.21 ( -1.78%) 21.59
length=1026: 23.36 ( -0.27%) 21.88 ( -6.61%) 23.43
length=1038: 22.64 ( 1.25%) 21.40 ( -4.33%) 22.36
length=1027: 22.68 ( -0.92%) 21.40 ( -6.49%) 22.89
length=1037: 22.02 ( 0.01%) 21.27 ( -3.42%) 22.02
length=1028: 23.09 ( -0.13%) 21.89 ( -5.32%) 23.12
length=1036: 22.74 ( 0.57%) 21.60 ( -4.45%) 22.61
length=1029: 22.56 ( -0.39%) 21.35 ( -5.73%) 22.65
length=1035: 22.20 ( 0.13%) 21.34 ( -3.78%) 22.17
length=1030: 23.06 ( -0.30%) 21.38 ( -7.56%) 23.13
length=1034: 22.80 ( 0.33%) 21.44 ( -5.67%) 22.73
length=1031: 22.40 ( -1.06%) 21.43 ( -5.33%) 22.64
length=1033: 22.26 ( -0.54%) 21.41 ( -4.32%) 22.38
length=2048: 24.12 ( -0.09%) 25.51 ( 5.65%) 24.14
length=2064: 23.57 ( 0.51%) 25.14 ( 7.19%) 23.45
length=2049: 23.46 ( -0.22%) 25.04 ( 6.51%) 23.51
length=2063: 22.94 ( 0.64%) 24.62 ( 8.00%) 22.79
length=2050: 23.99 ( -0.58%) 25.21 ( 4.47%) 24.13
length=2062: 23.74 ( 0.50%) 24.81 ( 5.00%) 23.63
length=2051: 23.27 ( -0.62%) 24.66 ( 5.28%) 23.42
length=2061: 22.97 ( -0.34%) 24.27 ( 5.28%) 23.05
length=2052: 23.87 ( -0.53%) 25.07 ( 4.46%) 24.00
length=2060: 23.70 ( 0.22%) 24.52 ( 3.68%) 23.65
length=2053: 23.13 ( -0.90%) 24.34 ( 4.30%) 23.34
length=2059: 22.95 ( -0.79%) 24.00 ( 3.76%) 23.13
length=2054: 23.76 ( -0.70%) 24.57 ( 2.68%) 23.93
length=2058: 23.69 ( 0.08%) 24.28 ( 2.58%) 23.67
length=2055: 23.16 ( -0.08%) 24.04 ( 3.74%) 23.18
length=2057: 23.11 ( -0.04%) 23.89 ( 3.34%) 23.12
length=4096: 24.43 ( 0.06%) 24.56 ( 0.59%) 24.42
length=4112: 24.07 ( -0.01%) 24.38 ( 1.30%) 24.07
length=4097: 23.69 ( -0.48%) 23.95 ( 0.62%) 23.80
length=4111: 23.40 ( 0.05%) 23.87 ( 2.08%) 23.39
length=4098: 24.42 ( -0.36%) 24.45 ( -0.21%) 24.51
length=4110: 24.18 ( 0.76%) 24.39 ( 1.61%) 24.00
length=4099: 23.64 ( -0.40%) 23.93 ( 0.85%) 23.73
length=4109: 23.30 ( -0.14%) 23.97 ( 2.72%) 23.33
length=4100: 24.13 ( -0.77%) 24.52 ( 0.85%) 24.32
length=4108: 23.93 ( -0.44%) 24.48 ( 1.84%) 24.04
length=4101: 23.43 ( -0.82%) 24.01 ( 1.64%) 23.62
length=4107: 23.17 ( -0.38%) 23.98 ( 3.09%) 23.26
length=4102: 24.05 ( -0.00%) 24.52 ( 1.97%) 24.05
length=4106: 23.80 ( -0.31%) 24.49 ( 2.58%) 23.87
length=4103: 23.34 ( -0.21%) 24.00 ( 2.65%) 23.38
length=4105: 23.24 ( -0.25%) 23.99 ( 2.95%) 23.30
length=8192: 24.44 ( 0.20%) 25.11 ( 2.94%) 24.39
length=8208: 24.11 ( -0.64%) 24.97 ( 2.92%) 24.27
length=8193: 23.82 ( 0.14%) 24.50 ( 2.99%) 23.79
length=8207: 23.57 ( -0.40%) 24.49 ( 3.48%) 23.66
length=8194: 24.46 ( 0.12%) 25.10 ( 2.76%) 24.43
length=8206: 24.24 ( -0.34%) 25.07 ( 3.08%) 24.32
length=8195: 23.70 ( -0.26%) 24.62 ( 3.61%) 23.76
length=8205: 23.57 ( -0.47%) 24.58 ( 3.79%) 23.68
length=8196: 24.32 ( -0.38%) 25.18 ( 3.16%) 24.41
length=8204: 24.22 ( -0.54%) 25.16 ( 3.32%) 24.36
length=8197: 23.68 ( -0.33%) 24.60 ( 3.57%) 23.75
length=8203: 23.60 ( -0.44%) 24.60 ( 3.76%) 23.71
length=8198: 24.31 ( 0.02%) 25.19 ( 3.61%) 24.31
length=8202: 24.34 ( 0.28%) 25.18 ( 3.72%) 24.28
length=8199: 23.74 ( 0.34%) 24.61 ( 4.01%) 23.66
length=8201: 23.70 ( 0.30%) 24.59 ( 4.06%) 23.63
length=16384: 19.99 ( -0.00%) 23.29 ( 16.49%) 19.99
length=16400: 19.75 ( -0.09%) 23.05 ( 16.58%) 19.77
length=16385: 20.14 ( 1.11%) 22.81 ( 14.51%) 19.92
length=16399: 21.32 ( 8.03%) 22.69 ( 15.00%) 19.73
length=16386: 19.97 ( -4.20%) 23.28 ( 11.68%) 20.85
length=16398: 19.80 ( -0.20%) 23.02 ( 16.03%) 19.84
length=16387: 19.89 ( -0.29%) 22.73 ( 13.91%) 19.95
length=16397: 20.50 ( 3.76%) 22.71 ( 14.98%) 19.75
length=16388: 19.92 ( -0.03%) 23.26 ( 16.72%) 19.93
length=16396: 19.84 ( 0.09%) 23.18 ( 16.95%) 19.82
length=16389: 19.85 ( -0.12%) 22.71 ( 14.27%) 19.87
length=16395: 19.78 ( -0.21%) 22.73 ( 14.70%) 19.82
length=16390: 19.84 ( -0.54%) 23.24 ( 16.51%) 19.95
length=16394: 19.86 ( -7.35%) 23.20 ( 8.27%) 21.43
length=16391: 19.83 ( -0.29%) 22.60 ( 13.67%) 19.89
length=16393: 19.80 ( -0.17%) 22.66 ( 14.28%) 19.83
length=32768: 18.04 ( 5.02%) 21.44 ( 24.80%) 17.18
length=32784: 14.93 ( -6.16%) 21.38 ( 34.35%) 15.91
length=32769: 16.36 ( 0.51%) 21.01 ( 29.06%) 16.28
length=32783: 14.93 ( -6.17%) 20.96 ( 31.72%) 15.91
length=32770: 15.90 ( -0.07%) 21.43 ( 34.70%) 15.91
length=32782: 14.98 ( -6.58%) 21.46 ( 33.79%) 16.04
length=32771: 15.45 ( -2.81%) 21.01 ( 32.11%) 15.90
length=32781: 15.27 (-32.11%) 21.03 ( -6.50%) 22.49
length=32772: 14.97 ( -6.11%) 21.38 ( 34.12%) 15.94
length=32780: 19.16 ( 7.70%) 21.46 ( 20.62%) 17.79
length=32773: 14.96 ( -6.08%) 20.95 ( 31.54%) 15.93
length=32779: 16.82 ( 1.44%) 21.04 ( 26.84%) 16.58
length=32774: 14.96 ( -6.10%) 21.37 ( 34.16%) 15.93
length=32778: 14.89 ( -8.88%) 21.46 ( 31.37%) 16.34
length=32775: 15.01 ( -5.75%) 20.96 ( 31.64%) 15.92
length=32777: 15.89 ( -0.01%) 21.04 ( 32.35%) 15.90
length=65536: 14.85 ( -5.68%) 21.44 ( 36.13%) 15.75
length=65552: 15.59 ( -1.35%) 21.50 ( 36.07%) 15.80
length=65537: 14.89 (-35.20%) 21.01 ( -8.56%) 22.98
length=65551: 14.83 ( -5.99%) 21.00 ( 33.12%) 15.78
length=65538: 15.88 (-18.88%) 21.42 ( 9.41%) 19.58
length=65550: 14.85 ( -6.00%) 21.39 ( 35.36%) 15.80
length=65539: 17.75 ( 2.67%) 21.01 ( 21.52%) 17.29
length=65549: 14.84 ( -6.27%) 20.99 ( 32.64%) 15.83
length=65540: 16.49 ( 1.45%) 21.42 ( 31.78%) 16.26
length=65548: 14.86 ( -6.00%) 21.41 ( 35.46%) 15.80
length=65541: 15.87 ( -0.17%) 21.01 ( 32.22%) 15.89
length=65547: 14.84 ( -6.11%) 21.09 ( 33.45%) 15.80
length=65542: 15.74 ( 0.15%) 21.43 ( 36.32%) 15.72
length=65546: 14.86 (-36.52%) 21.50 ( -8.13%) 23.41
length=65543: 14.79 ( -5.99%) 20.92 ( 33.01%) 15.73
length=65545: 15.75 (-21.17%) 21.09 ( 5.54%) 19.98
<snip>
length=128: 12.16 ( -1.84%) 13.22 ( 6.70%) 12.39
length=144: 11.39 ( -0.26%) 14.12 ( 23.62%) 11.42
length=129: 12.42 ( -0.72%) 13.23 ( 5.70%) 12.51
length=143: 11.26 ( -0.83%) 14.01 ( 23.35%) 11.36
length=130: 12.23 ( -0.73%) 13.53 ( 9.83%) 12.32
length=142: 11.45 ( -0.14%) 14.09 ( 22.84%) 11.47
length=131: 12.14 ( -0.56%) 13.43 ( 9.97%) 12.21
length=141: 11.39 ( -0.36%) 14.02 ( 22.63%) 11.44
length=132: 12.13 ( -0.58%) 13.56 ( 11.10%) 12.20
length=140: 11.55 ( 0.02%) 14.00 ( 21.23%) 11.55
length=133: 11.99 ( -0.44%) 13.55 ( 12.45%) 12.05
length=139: 11.57 ( 0.15%) 13.88 ( 20.18%) 11.55
length=134: 12.10 ( 0.23%) 13.62 ( 12.79%) 12.07
length=138: 11.68 ( -0.15%) 13.63 ( 16.56%) 11.69
length=135: 11.71 ( -1.01%) 13.44 ( 13.59%) 11.83
length=137: 11.58 ( -0.99%) 13.62 ( 16.43%) 11.70
length=256: 19.97 ( -1.69%) 19.93 ( -1.88%) 20.31
length=272: 17.60 ( -1.70%) 19.41 ( 8.40%) 17.90
length=257: 20.02 ( -1.35%) 20.09 ( -1.02%) 20.30
length=271: 17.68 ( -1.95%) 19.36 ( 7.38%) 18.03
length=258: 19.58 ( -2.07%) 19.73 ( -1.31%) 20.00
length=270: 17.92 ( -1.83%) 19.45 ( 6.56%) 18.25
length=259: 19.45 ( -1.60%) 19.63 ( -0.65%) 19.76
length=269: 17.94 ( -2.05%) 19.17 ( 4.65%) 18.32
length=260: 19.39 ( -1.37%) 19.68 ( 0.11%) 19.66
length=268: 17.99 ( -3.19%) 19.13 ( 2.94%) 18.58
length=261: 19.07 ( -1.59%) 19.33 ( -0.23%) 19.38
length=267: 18.00 ( -1.52%) 19.20 ( 5.03%) 18.28
length=262: 18.74 ( -1.62%) 19.25 ( 1.02%) 19.05
length=266: 18.21 ( -1.30%) 19.23 ( 4.24%) 18.45
length=263: 18.49 ( -1.85%) 19.21 ( 1.98%) 18.84
length=265: 18.27 ( -1.73%) 19.18 ( 3.15%) 18.59
length=512: 22.47 ( -1.50%) 23.52 ( 3.11%) 22.81
length=528: 20.61 ( -0.73%) 22.93 ( 10.47%) 20.76
length=513: 22.25 ( -0.71%) 23.47 ( 4.74%) 22.41
length=527: 20.57 ( -0.48%) 22.60 ( 9.33%) 20.67
length=514: 22.30 ( -0.97%) 23.24 ( 3.22%) 22.51
length=526: 20.92 ( -0.72%) 22.80 ( 8.18%) 21.08
length=515: 21.79 ( -1.17%) 22.96 ( 4.12%) 22.05
length=525: 20.41 ( -1.18%) 22.73 ( 10.03%) 20.66
length=516: 21.85 ( -1.65%) 23.04 ( 3.74%) 22.21
length=524: 21.02 ( -0.65%) 22.73 ( 7.44%) 21.16
length=517: 21.39 ( -1.06%) 22.80 ( 5.47%) 21.62
length=523: 20.74 ( -0.92%) 22.78 ( 8.79%) 20.94
length=518: 21.55 ( -1.32%) 23.01 ( 5.34%) 21.84
length=522: 21.14 ( -0.80%) 22.87 ( 7.35%) 21.30
length=519: 21.20 ( -0.49%) 22.88 ( 7.41%) 21.30
length=521: 20.99 ( -0.85%) 22.73 ( 7.38%) 21.17
length=1024: 23.56 ( -1.10%) 22.05 ( -7.45%) 23.82
length=1040: 22.61 ( 0.20%) 21.43 ( -5.02%) 22.56
length=1025: 22.93 ( -1.78%) 21.20 ( -9.22%) 23.35
length=1039: 21.87 ( -0.87%) 21.16 ( -4.10%) 22.06
length=1026: 23.37 ( -0.50%) 22.15 ( -5.67%) 23.48
length=1038: 22.59 ( 0.01%) 21.40 ( -5.24%) 22.59
length=1027: 22.71 ( -1.14%) 21.48 ( -6.46%) 22.97
length=1037: 22.16 ( -0.38%) 21.31 ( -4.19%) 22.24
length=1028: 23.09 ( -1.13%) 21.92 ( -6.14%) 23.35
length=1036: 22.66 ( -0.48%) 21.64 ( -4.95%) 22.77
length=1029: 22.50 ( -0.97%) 21.37 ( -5.96%) 22.72
length=1035: 22.26 ( -0.19%) 21.41 ( -3.99%) 22.30
length=1030: 23.04 ( -0.35%) 21.35 ( -7.64%) 23.12
length=1034: 22.83 ( -0.35%) 21.41 ( -6.56%) 22.91
length=1031: 22.48 ( -0.76%) 21.44 ( -5.37%) 22.66
length=1033: 22.28 ( -0.97%) 21.24 ( -5.62%) 22.50
length=2048: 24.01 ( -0.97%) 25.51 ( 5.20%) 24.25
length=2064: 23.49 ( -0.23%) 25.18 ( 6.95%) 23.54
length=2049: 23.39 ( -0.92%) 25.04 ( 6.08%) 23.61
length=2063: 22.78 ( -0.43%) 24.63 ( 7.64%) 22.88
length=2050: 23.99 ( -0.15%) 25.38 ( 5.61%) 24.03
length=2062: 23.74 ( 0.84%) 24.92 ( 5.86%) 23.54
length=2051: 23.28 ( -0.24%) 24.75 ( 6.07%) 23.33
length=2061: 22.97 ( 0.05%) 24.37 ( 6.15%) 22.96
length=2052: 23.95 ( -0.19%) 24.97 ( 4.04%) 24.00
length=2060: 23.79 ( 0.54%) 24.43 ( 3.25%) 23.66
length=2053: 23.21 ( -0.56%) 24.25 ( 3.88%) 23.35
length=2059: 23.04 ( -0.38%) 23.84 ( 3.10%) 23.13
length=2054: 23.85 ( -0.34%) 24.51 ( 2.41%) 23.93
length=2058: 23.59 ( -0.81%) 24.28 ( 2.11%) 23.78
length=2055: 23.08 ( -0.85%) 24.01 ( 3.14%) 23.28
length=2057: 23.03 ( -0.78%) 23.82 ( 2.63%) 23.21
length=4096: 24.34 ( -0.76%) 24.56 ( 0.15%) 24.52
length=4112: 24.07 ( 0.31%) 24.47 ( 1.99%) 24.00
length=4097: 23.68 ( -0.20%) 24.05 ( 1.35%) 23.73
length=4111: 23.41 ( 0.48%) 23.96 ( 2.84%) 23.30
length=4098: 24.42 ( -0.00%) 24.56 ( 0.57%) 24.42
length=4110: 24.19 ( 1.10%) 24.48 ( 2.28%) 23.93
length=4099: 23.63 ( -0.46%) 23.89 ( 0.64%) 23.74
length=4109: 23.32 ( -0.11%) 23.87 ( 2.27%) 23.35
length=4100: 24.22 ( -0.41%) 24.43 ( 0.44%) 24.32
length=4108: 24.01 ( -0.17%) 24.39 ( 1.41%) 24.05
length=4101: 23.51 ( -0.45%) 23.93 ( 1.32%) 23.61
length=4107: 23.17 ( -0.86%) 23.98 ( 2.61%) 23.37
length=4102: 23.96 ( -0.69%) 24.52 ( 1.62%) 24.13
length=4106: 23.71 ( -1.02%) 24.50 ( 2.27%) 23.96
length=4103: 23.26 ( -0.94%) 24.00 ( 2.25%) 23.47
length=4105: 23.16 ( -0.72%) 23.99 ( 2.84%) 23.33
length=8192: 24.44 ( 0.91%) 25.21 ( 4.07%) 24.22
length=8208: 24.12 ( -0.24%) 25.15 ( 4.05%) 24.18
length=8193: 23.82 ( 0.54%) 24.63 ( 3.96%) 23.69
length=8207: 23.57 ( -0.06%) 24.59 ( 4.26%) 23.59
length=8194: 24.46 ( 0.54%) 25.21 ( 3.63%) 24.33
length=8206: 24.24 ( 0.09%) 25.17 ( 3.92%) 24.22
length=8195: 23.78 ( 0.03%) 24.50 ( 3.05%) 23.78
length=8205: 23.65 ( -0.15%) 24.47 ( 3.28%) 23.69
length=8196: 24.43 ( 0.03%) 25.06 ( 2.62%) 24.42
length=8204: 24.32 ( -0.19%) 25.04 ( 2.79%) 24.36
length=8197: 23.76 ( -0.02%) 24.49 ( 3.05%) 23.76
length=8203: 23.69 ( -0.09%) 24.49 ( 3.29%) 23.71
length=8198: 24.31 ( -0.43%) 25.19 ( 3.19%) 24.41
length=8202: 24.26 ( -0.48%) 25.18 ( 3.31%) 24.37
length=8199: 23.65 ( -0.42%) 24.61 ( 3.60%) 23.75
length=8201: 23.62 ( -0.44%) 24.59 ( 3.66%) 23.72
length=16384: 22.39 ( 10.22%) 23.33 ( 14.82%) 20.32
length=16400: 19.81 ( -8.87%) 23.17 ( 6.60%) 21.74
length=16385: 19.89 ( -0.35%) 22.74 ( 13.94%) 19.96
length=16399: 19.73 ( -0.13%) 22.64 ( 14.57%) 19.76
length=16386: 20.77 ( 3.64%) 23.31 ( 16.31%) 20.04
length=16398: 19.75 ( -3.49%) 23.19 ( 13.30%) 20.47
length=16387: 19.83 ( -0.30%) 22.84 ( 14.84%) 19.89
length=16397: 19.73 ( -0.17%) 22.66 ( 14.65%) 19.76
length=16388: 19.89 ( -0.36%) 23.29 ( 16.65%) 19.97
length=16396: 20.19 ( 1.72%) 23.21 ( 16.93%) 19.85
length=16389: 19.81 ( -0.04%) 22.82 ( 15.17%) 19.81
length=16395: 19.73 ( -0.34%) 22.67 ( 14.48%) 19.80
length=16390: 19.95 ( -0.19%) 23.18 ( 15.97%) 19.99
length=16394: 19.99 ( 0.44%) 23.23 ( 16.74%) 19.90
length=16391: 19.78 ( -0.22%) 22.80 ( 14.99%) 19.83
length=16393: 19.76 ( -0.10%) 22.78 ( 15.15%) 19.78
length=32768: 14.96 ( -6.58%) 21.36 ( 33.39%) 16.02
length=32784: 20.12 (-11.22%) 21.46 ( -5.30%) 22.66
length=32769: 14.94 ( -6.65%) 20.94 ( 30.91%) 16.00
length=32783: 16.21 (-32.41%) 21.03 (-12.27%) 23.97
length=32770: 19.88 ( 9.30%) 21.44 ( 17.89%) 18.19
length=32782: 14.96 ( -6.17%) 21.39 ( 34.13%) 15.94
length=32771: 16.97 ( 1.87%) 21.01 ( 26.14%) 16.66
length=32781: 14.95 ( -6.21%) 20.97 ( 31.54%) 15.94
length=32772: 16.12 ( 0.90%) 21.48 ( 34.47%) 15.97
length=32780: 14.97 ( -5.98%) 21.47 ( 34.88%) 15.92
length=32773: 15.61 ( -2.41%) 21.05 ( 31.55%) 16.00
length=32779: 14.94 (-34.51%) 21.04 ( -7.74%) 22.81
length=32774: 15.02 ( -6.18%) 21.38 ( 33.56%) 16.00
length=32778: 14.95 ( -6.27%) 21.47 ( 34.67%) 15.94
length=32775: 15.00 ( -6.16%) 20.98 ( 31.23%) 15.99
length=32777: 14.95 (-24.12%) 21.05 ( 6.79%) 19.71
length=65536: 15.12 ( -6.19%) 21.50 ( 33.32%) 16.12
length=65552: 17.82 (-28.61%) 21.64 (-13.32%) 24.96
length=65537: 15.07 (-31.44%) 21.13 ( -3.87%) 21.98
length=65551: 15.01 (-39.84%) 21.13 (-15.33%) 24.96
length=65538: 16.31 ( 1.44%) 21.57 ( 34.16%) 16.08
length=65550: 15.04 (-10.11%) 21.64 ( 29.37%) 16.73
length=65539: 15.36 ( -4.41%) 21.13 ( 31.45%) 16.07
length=65549: 16.02 ( 0.26%) 21.20 ( 32.74%) 15.97
length=65540: 15.08 ( -6.20%) 21.49 ( 33.69%) 16.07
length=65548: 15.01 (-30.63%) 21.64 ( 0.05%) 21.63
length=65541: 15.06 ( -6.19%) 21.05 ( 31.15%) 16.05
length=65547: 15.03 ( -6.09%) 21.21 ( 32.49%) 16.01
length=65542: 15.05 ( -6.20%) 21.49 ( 33.90%) 16.05
length=65546: 18.31 ( 5.97%) 21.65 ( 25.27%) 17.28
length=65543: 15.03 ( -6.19%) 21.05 ( 31.34%) 16.02
length=65545: 15.03 ( -6.21%) 21.21 ( 32.29%) 16.03
<snip>
* sysdeps/aarch64/multiarch/Makefile (sysdep_routines): Add
memmove_falkor.
* sysdeps/aarch64/multiarch/ifunc-impl-list.c
(__libc_ifunc_impl_list): Likewise.
* sysdeps/aarch64/multiarch/memmove.c: Likewise.
* sysdeps/aarch64/multiarch/memmove_falkor.S: New file.
---
sysdeps/aarch64/multiarch/Makefile | 3 +-
sysdeps/aarch64/multiarch/ifunc-impl-list.c | 1 +
sysdeps/aarch64/multiarch/memmove.c | 7 +-
sysdeps/aarch64/multiarch/memmove_falkor.S | 232 ++++++++++++++++++++++++++++
4 files changed, 241 insertions(+), 2 deletions(-)
create mode 100644 sysdeps/aarch64/multiarch/memmove_falkor.S
@@ -1,3 +1,4 @@
ifeq ($(subdir),string)
-sysdep_routines += memcpy_generic memcpy_thunderx memcpy_falkor
+sysdep_routines += memcpy_generic memcpy_thunderx memcpy_falkor \
+ memmove_falkor
endif
@@ -44,6 +44,7 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
IFUNC_IMPL_ADD (array, i, memcpy, 1, __memcpy_generic))
IFUNC_IMPL (i, name, memmove,
IFUNC_IMPL_ADD (array, i, memmove, 1, __memmove_thunderx)
+ IFUNC_IMPL_ADD (array, i, memmove, 1, __memmove_falkor)
IFUNC_IMPL_ADD (array, i, memmove, 1, __memmove_generic))
return i;
@@ -30,9 +30,14 @@ extern __typeof (__redirect_memmove) __libc_memmove;
extern __typeof (__redirect_memmove) __memmove_generic attribute_hidden;
extern __typeof (__redirect_memmove) __memmove_thunderx attribute_hidden;
+extern __typeof (__redirect_memmove) __memmove_falkor attribute_hidden;
libc_ifunc (__libc_memmove,
- IS_THUNDERX (midr) ? __memmove_thunderx : __memmove_generic);
+ (IS_THUNDERX (midr)
+ ? __memmove_thunderx
+ : (IS_FALKOR (midr)
+ ? __memmove_falkor
+ : __memmove_generic)));
# undef memmove
strong_alias (__libc_memmove, memmove);
new file mode 100644
@@ -0,0 +1,232 @@
+/* Copyright (C) 2017 Free Software Foundation, Inc.
+
+ This file is part of the GNU C Library.
+
+ The GNU C Library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 2.1 of the License, or (at your option) any later version.
+
+ The GNU C Library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ Lesser General Public License for more details.
+
+ You should have received a copy of the GNU Lesser General Public
+ License along with the GNU C Library. If not, see
+ <http://www.gnu.org/licenses/>. */
+
+#include <sysdep.h>
+
+/* Assumptions: ARMv8-a, AArch64, falkor, unaligned accesses. */
+
+#define dstin x0 /* Destination pointer argument; never written. */
+#define src x1 /* Source pointer argument. */
+#define count x2 /* Number of bytes to copy. */
+#define dstlen x3 /* Not referenced in this file. */
+#define dst x3 /* Running destination pointer in L(copy_long). */
+#define srcend x4 /* src + count. */
+#define dstend x5 /* dstin + count. */
+#define A_l x6 /* A..D: data registers for loads and stores. */
+#define A_lw w6 /* 32-bit view of A_l for the 1..7 byte copies. */
+#define A_h x7
+#define A_hw w7 /* 32-bit view of A_h for the 2..7 byte copies. */
+#define B_l x8 /* Also holds the saved Q_l during the long copies. */
+#define B_lw w8 /* Not referenced in this file. */
+#define B_h x9 /* Also holds the saved Q_h during the long copies. */
+#define C_l x10
+#define C_h x11
+#define D_l x12
+#define D_h x13
+#define E_l src /* E and F reuse pointer registers; L(copy96) only. */
+#define E_h count
+#define F_l srcend
+#define F_h dst
+#define tmp1 x14 /* Scratch. */
+
+/* Second pair, used alternately with A_l and A_h to train the prefetcher. */
+#define Q_l x22 /* Saved to B_l and restored before returning. */
+#define Q_h x23 /* Saved to B_h and restored before returning. */
+
+/* RATIONALE:
+
+ The copy has 4 distinct parts:
+ * Small copies of 16 bytes and under
+ * Medium-sized copies of 17-96 bytes
+ * Large copies where the source address is higher than the destination
+ (forward copies)
+ * Large copies where the destination address is higher than the source
+ (copy backward, or move).
+
+ The large copies use only two register pairs, x6,x7 and x22,x23, and copy
+ 32 bytes per loop iteration to correctly train the hardware prefetcher for
+ better throughput. */
+ENTRY_ALIGN (__memmove_falkor, 6)
+ /* In: x0 = dstin, x1 = src, x2 = count. x0 is never written. */
+ sub tmp1, dstin, src /* tmp1 = dstin - src. */
+ add srcend, src, count /* One past the last source byte. */
+ add dstend, dstin, count /* One past the last destination byte. */
+ cmp count, 96
+ ccmp tmp1, count, 2, hi /* count > 96: compare tmp1 with count; else set C. */
+ b.lo L(move_long) /* count > 96 and dstin - src < count (unsigned). */
+
+ cmp count, 16
+ b.ls L(copy16) /* 0..16 bytes. */
+ cmp count, 96
+ b.hi L(copy_long) /* count > 96 and dstin - src >= count (unsigned). */
+
+ /* Medium copies: 17..96 bytes. Every load precedes the first store. */
+ sub tmp1, count, 1 /* Bits 6 and 5 of count - 1 pick the size class. */
+ ldp A_l, A_h, [src]
+ tbnz tmp1, 6, L(copy96) /* count - 1 >= 64, i.e. 65..96 bytes. */
+ ldp D_l, D_h, [srcend, -16]
+ tbz tmp1, 5, 1f /* 17..32 bytes: first and last 16 bytes suffice. */
+ ldp B_l, B_h, [src, 16] /* 33..64 bytes: 32 from each end. */
+ ldp C_l, C_h, [srcend, -32]
+ stp B_l, B_h, [dstin, 16]
+ stp C_l, C_h, [dstend, -32]
+1:
+ stp A_l, A_h, [dstin]
+ stp D_l, D_h, [dstend, -16]
+ ret
+
+ .p2align 4
+ /* Small copies: 0..16 bytes. */
+L(copy16):
+ cmp count, 8
+ b.lo 1f
+ ldr A_l, [src] /* 8..16 bytes: first and last 8 bytes, which may overlap. */
+ ldr A_h, [srcend, -8]
+ str A_l, [dstin]
+ str A_h, [dstend, -8]
+ ret
+ .p2align 4
+1:
+ /* 4-7 */
+ tbz count, 2, 1f /* count < 8 here, so bit 2 set means 4..7. */
+ ldr A_lw, [src]
+ ldr A_hw, [srcend, -4]
+ str A_lw, [dstin]
+ str A_hw, [dstend, -4]
+ ret
+ .p2align 4
+1:
+ /* 2-3 */
+ tbz count, 1, 1f /* count < 4 here, so bit 1 set means 2..3. */
+ ldrh A_lw, [src]
+ ldrh A_hw, [srcend, -2]
+ strh A_lw, [dstin]
+ strh A_hw, [dstend, -2]
+ ret
+ .p2align 4
+1:
+ /* 0-1 */
+ tbz count, 0, 1f /* count is 0 or 1 here; nothing to do for 0. */
+ ldrb A_lw, [src]
+ strb A_lw, [dstin]
+1: ret
+
+ .p2align 4
+ /* Copy 65..96 bytes. Copy 64 bytes from the start and
+ 32 bytes from the end. */
+L(copy96):
+ ldp B_l, B_h, [src, 16] /* A_l, A_h were loaded before the branch here. */
+ ldp C_l, C_h, [src, 32]
+ ldp D_l, D_h, [src, 48]
+ ldp E_l, E_h, [srcend, -32] /* Overwrites src and count; last use of both was above. */
+ ldp F_l, F_h, [srcend, -16] /* Overwrites srcend and dst; last use of srcend. */
+ stp A_l, A_h, [dstin]
+ stp B_l, B_h, [dstin, 16]
+ stp C_l, C_h, [dstin, 32]
+ stp D_l, D_h, [dstin, 48]
+ stp E_l, E_h, [dstend, -32]
+ stp F_l, F_h, [dstend, -16]
+ ret
+
+ /* Align SRC to 16 byte alignment so that we don't cross cache line
+ boundaries on both loads and stores. There are at least 96 bytes
+ to copy, so copy 16 bytes unaligned and then align. The loop
+ copies 32 bytes per iteration and prefetches one iteration ahead. */
+
+ .p2align 4
+L(copy_long):
+ sub count, count, 64 + 16 /* Test and readjust count. */
+ mov B_l, Q_l /* Save x22; restored before the ret below. */
+ mov B_h, Q_h /* Save x23; restored before the ret below. */
+ ldp A_l, A_h, [src] /* First 16 bytes, from the original src. */
+ and tmp1, src, 15 /* tmp1 = src modulo 16. */
+ bic src, src, 15 /* Round src down to a 16-byte boundary. */
+ sub dst, dstin, tmp1 /* Move dst back by the same amount as src. */
+ add count, count, tmp1 /* Count is now 16 too large. */
+ ldp Q_l, Q_h, [src, 16]! /* Pre-index writeback: src += 16, then load. */
+ stp A_l, A_h, [dstin]
+ ldp A_l, A_h, [src, 16]!
+
+L(loop64): /* Despite the label name, each iteration moves 32 bytes. */
+ subs count, count, 32
+ stp Q_l, Q_h, [dst, 16] /* Store data loaded earlier ... */
+ ldp Q_l, Q_h, [src, 16]! /* ... then reload the pair for the next round. */
+ stp A_l, A_h, [dst, 32]! /* Pre-index writeback: dst += 32. */
+ ldp A_l, A_h, [src, 16]!
+ b.hi L(loop64) /* Uses the flags set by the subs above. */
+
+ /* Write the last full set of 32 bytes. The remainder is at most 32
+ bytes, so it is safe to always copy 32 bytes from the end even if
+ there is just 1 byte left. */
+L(last64):
+ ldp C_l, C_h, [srcend, -32]
+ stp Q_l, Q_h, [dst, 16] /* Flush the pair still pending from the loop. */
+ ldp Q_l, Q_h, [srcend, -16]
+ stp A_l, A_h, [dst, 32] /* Flush the other pending pair. */
+ stp C_l, C_h, [dstend, -32]
+ stp Q_l, Q_h, [dstend, -16]
+ mov Q_l, B_l /* Restore x22. */
+ mov Q_h, B_h /* Restore x23. */
+ ret
+
+ .p2align 4
+L(move_long): /* Backward copy, working down from srcend and dstend. */
+ cbz tmp1, 3f /* dstin == src: nothing to copy. */
+
+ mov B_l, Q_l /* Save x22; restored before the ret below. */
+ mov B_h, Q_h /* Save x23; restored before the ret below. */
+
+ /* Align SRCEND to 16 byte alignment so that we don't cross cache line
+ boundaries on both loads and stores. There are at least 96 bytes
+ to copy, so copy 16 bytes unaligned and then align. The loop
+ copies 32 bytes per iteration and prefetches one iteration ahead. */
+
+ ldp A_l, A_h, [srcend, -16] /* Last 16 bytes, from the original srcend. */
+ and tmp1, srcend, 15 /* tmp1 = srcend modulo 16. */
+ sub srcend, srcend, tmp1 /* Round srcend down to a 16-byte boundary. */
+ ldp Q_l, Q_h, [srcend, -16]! /* Pre-index writeback: srcend -= 16, then load. */
+ stp A_l, A_h, [dstend, -16]
+ sub count, count, tmp1
+ ldp A_l, A_h, [srcend, -16]!
+ sub dstend, dstend, tmp1 /* Move dstend back by the same amount as srcend. */
+ sub count, count, 64
+
+1:
+ subs count, count, 32 /* 32 bytes are moved per iteration. */
+ stp Q_l, Q_h, [dstend, -16] /* Store data loaded earlier ... */
+ ldp Q_l, Q_h, [srcend, -16]! /* ... then reload the pair for the next round. */
+ stp A_l, A_h, [dstend, -32]! /* Pre-index writeback: dstend -= 32. */
+ ldp A_l, A_h, [srcend, -16]!
+ b.hi 1b /* Uses the flags set by the subs above. */
+
+ /* Write the last full set of 32 bytes. The remainder is at most 32
+ bytes, so it is safe to always copy 32 bytes from the start even if
+ there is just 1 byte left. */
+2:
+ ldp C_l, C_h, [src, 16]
+ stp Q_l, Q_h, [dstend, -16] /* Flush the pair still pending from the loop. */
+ ldp Q_l, Q_h, [src]
+ stp A_l, A_h, [dstend, -32] /* Flush the other pending pair. */
+ stp C_l, C_h, [dstin, 16]
+ stp Q_l, Q_h, [dstin]
+ mov Q_l, B_l /* Restore x22. */
+ mov Q_h, B_h /* Restore x23. */
+3: ret
+
+END (__memmove_falkor)
+libc_hidden_builtin_def (__memmove_falkor)