Revert "Improve the performance of MatrixTransposeVector Multiply." This reverts commit 6c835f81a7c4315518c0ed766b1eef511335bf0b. This CL introduced a subtle cancellation error which is causing some application tests to fail, and is therefore being reverted. Change-Id: Iaaa8f13a18f0a805b0e70a25ccd1e97efb21f330
diff --git a/internal/ceres/benchmarks/macbook-pro-2014-small_blas_gemv_benchmark.json b/internal/ceres/benchmarks/macbook-pro-2014-small_blas_gemv_benchmark.json index 1798ccb..3e483a2 100644 --- a/internal/ceres/benchmarks/macbook-pro-2014-small_blas_gemv_benchmark.json +++ b/internal/ceres/benchmarks/macbook-pro-2014-small_blas_gemv_benchmark.json
@@ -1,6 +1,6 @@ { "context": { - "date": "2018-02-25 13:34:34", + "date": "2018-02-25 13:20:04", "num_cpus": 8, "mhz_per_cpu": 2200, "cpu_scaling_enabled": false, @@ -9,506 +9,506 @@ "benchmarks": [ { "name": "BM_MatrixVectorMultiply/1/1", - "iterations": 70252205, - "real_time": 1.0236544348302301e+01, - "cpu_time": 1.0086487676792496e+01, + "iterations": 69298697, + "real_time": 1.0097105894512250e+01, + "cpu_time": 1.0040275360444367e+01, "time_unit": "ns" }, { "name": "BM_MatrixVectorMultiply/1/2", - "iterations": 68107961, - "real_time": 1.0524348980780600e+01, - "cpu_time": 1.0394526419606072e+01, + "iterations": 65913992, + "real_time": 1.0302522610563768e+01, + "cpu_time": 1.0245351244998181e+01, "time_unit": "ns" }, { "name": "BM_MatrixVectorMultiply/1/3", - "iterations": 69200732, - "real_time": 1.0096407926919197e+01, - "cpu_time": 9.9902844958345245e+00, + "iterations": 73595895, + "real_time": 9.6440729347083387e+00, + "cpu_time": 9.5853172245544389e+00, "time_unit": "ns" }, { "name": "BM_MatrixVectorMultiply/1/4", - "iterations": 63594738, - "real_time": 1.0774345118765158e+01, - "cpu_time": 1.0670930038268263e+01, + "iterations": 69574897, + "real_time": 1.0073530211782117e+01, + "cpu_time": 9.9997560901886722e+00, "time_unit": "ns" }, { "name": "BM_MatrixVectorMultiply/1/6", - "iterations": 58849740, - "real_time": 1.2862122686593422e+01, - "cpu_time": 1.2680090005495353e+01, + "iterations": 58273603, + "real_time": 1.1879482448448192e+01, + "cpu_time": 1.1746073089045133e+01, "time_unit": "ns" }, { "name": "BM_MatrixVectorMultiply/1/7", - "iterations": 50270744, - "real_time": 1.4796897833024486e+01, - "cpu_time": 1.4365512473815775e+01, + "iterations": 54426846, + "real_time": 1.2970374490544540e+01, + "cpu_time": 1.2881400476522181e+01, "time_unit": "ns" }, { "name": "BM_MatrixVectorMultiply/1/12", - "iterations": 38925219, - "real_time": 1.7628035668447385e+01, - "cpu_time": 1.7363447589080991e+01, + "iterations": 40754541, + "real_time": 1.7086407941937370e+01, + "cpu_time": 1.6969201051730625e+01, "time_unit": "ns" }, { "name": "BM_MatrixVectorMultiply/1/16", - "iterations": 34818249, - "real_time": 2.0217151184202585e+01, - "cpu_time": 1.9996726429292874e+01, + "iterations": 36813042, + "real_time": 2.0977509004292560e+01, + "cpu_time": 2.0338009556504460e+01, "time_unit": "ns" }, { "name": "BM_MatrixVectorMultiply/1/20", - "iterations": 31906359, - "real_time": 2.2445605654072409e+01, - "cpu_time": 2.2253933769127354e+01, + "iterations": 32061264, + "real_time": 2.1783493596539344e+01, + "cpu_time": 2.1625753744456222e+01, "time_unit": "ns" }, { "name": "BM_MatrixVectorMultiply/2/1", - "iterations": 71029213, - "real_time": 9.9579144427668691e+00, - "cpu_time": 9.8622238711838168e+00, + "iterations": 71987577, + "real_time": 9.9393425620120528e+00, + "cpu_time": 9.8528944792793975e+00, "time_unit": "ns" }, { "name": "BM_MatrixVectorMultiply/2/2", - "iterations": 63346123, - "real_time": 1.1705572873750949e+01, - "cpu_time": 1.1623789509580556e+01, + "iterations": 63771443, + "real_time": 1.1381383373240160e+01, + "cpu_time": 1.1294600939169571e+01, "time_unit": "ns" }, { "name": "BM_MatrixVectorMultiply/2/3", - "iterations": 51417658, - "real_time": 1.3784111481715327e+01, - "cpu_time": 1.3678199812212368e+01, + "iterations": 52469043, + "real_time": 1.5624869525055580e+01, + "cpu_time": 1.5158995752981452e+01, "time_unit": "ns" }, { "name": "BM_MatrixVectorMultiply/2/4", - "iterations": 43867070, - "real_time": 1.6680275408505068e+01, - "cpu_time": 1.6505866473416127e+01, + "iterations": 45512470, + "real_time": 1.5249180278513519e+01, + "cpu_time": 1.4811259419671170e+01, "time_unit": "ns" }, { "name": "BM_MatrixVectorMultiply/2/6", - "iterations": 38832366, - "real_time": 1.8493552852222152e+01, - "cpu_time": 1.8353143869729728e+01, + "iterations": 40479275, + "real_time": 1.7419527499445600e+01, + "cpu_time": 1.7267799386229168e+01, "time_unit": "ns" }, { "name": "BM_MatrixVectorMultiply/2/7", - "iterations": 35359607, - "real_time": 2.1535896143019905e+01, - "cpu_time": 2.0674296521451733e+01, + "iterations": 35497677, + "real_time": 1.9656466957520109e+01, + "cpu_time": 1.9009525609239173e+01, "time_unit": "ns" }, { "name": "BM_MatrixVectorMultiply/2/12", - "iterations": 29137044, - "real_time": 2.4302196854567779e+01, - "cpu_time": 2.4139888727216171e+01, + "iterations": 27042793, + "real_time": 2.4197042293219681e+01, + "cpu_time": 2.3855672008434897e+01, "time_unit": "ns" }, { "name": "BM_MatrixVectorMultiply/2/16", - "iterations": 24110910, - "real_time": 2.9894188605096506e+01, - "cpu_time": 2.9568937879159293e+01, + "iterations": 24077820, + "real_time": 2.8851556123593411e+01, + "cpu_time": 2.8603793865059156e+01, "time_unit": "ns" }, { "name": "BM_MatrixVectorMultiply/2/20", - "iterations": 20147885, - "real_time": 3.4507044086219317e+01, - "cpu_time": 3.3935571897496963e+01, + "iterations": 20977240, + "real_time": 3.2981163630488950e+01, + "cpu_time": 3.2740865814568508e+01, "time_unit": "ns" }, { "name": "BM_MatrixVectorMultiply/3/1", - "iterations": 61213425, - "real_time": 1.1528627127493634e+01, - "cpu_time": 1.1442522616566544e+01, + "iterations": 61687596, + "real_time": 1.1764908379779497e+01, + "cpu_time": 1.1687698123298562e+01, "time_unit": "ns" }, { "name": "BM_MatrixVectorMultiply/3/2", - "iterations": 45232787, - "real_time": 1.5206686116791806e+01, - "cpu_time": 1.5059783957154792e+01, + "iterations": 43755469, + "real_time": 1.6423369476207299e+01, + "cpu_time": 1.6309206970218952e+01, "time_unit": "ns" }, { "name": "BM_MatrixVectorMultiply/3/3", - "iterations": 39082565, - "real_time": 1.7894326358792654e+01, - "cpu_time": 1.7703981302148417e+01, + "iterations": 37693381, + "real_time": 1.8031485450223030e+01, + "cpu_time": 1.7915983710774054e+01, "time_unit": "ns" }, { "name": "BM_MatrixVectorMultiply/3/4", - "iterations": 33507894, - "real_time": 2.1085192464583656e+01, - "cpu_time": 2.0737620812576353e+01, + "iterations": 36812849, + "real_time": 1.9044860858008274e+01, + "cpu_time": 1.8915433575923469e+01, "time_unit": "ns" }, { "name": "BM_MatrixVectorMultiply/3/6", - "iterations": 30657919, - "real_time": 2.3736662589079380e+01, - "cpu_time": 2.3452798606454710e+01, + "iterations": 31380003, + "real_time": 2.2716422938798129e+01, + "cpu_time": 2.2573452271499175e+01, "time_unit": "ns" }, { "name": "BM_MatrixVectorMultiply/3/7", - "iterations": 29037209, - "real_time": 2.8086983741164396e+01, - "cpu_time": 2.6849377982574079e+01, + "iterations": 29979614, + "real_time": 2.5305456738143899e+01, + "cpu_time": 2.4368525892294780e+01, "time_unit": "ns" }, { "name": "BM_MatrixVectorMultiply/3/12", - "iterations": 20780516, - "real_time": 3.7916595043916402e+01, - "cpu_time": 3.4962991294345251e+01, + "iterations": 22574601, + "real_time": 3.2935694633936649e+01, + "cpu_time": 3.1806276443158403e+01, "time_unit": "ns" }, { "name": "BM_MatrixVectorMultiply/3/16", - "iterations": 14964235, - "real_time": 4.5643732738804459e+01, - "cpu_time": 4.4360169430645840e+01, + "iterations": 18534847, + "real_time": 3.7167092181413921e+01, + "cpu_time": 3.6952179858835549e+01, "time_unit": "ns" }, { "name": "BM_MatrixVectorMultiply/3/20", - "iterations": 10665042, - "real_time": 6.0875662188190091e+01, - "cpu_time": 5.7067567103814788e+01, + "iterations": 14379061, + "real_time": 4.9144608746192276e+01, + "cpu_time": 4.8366649254773812e+01, "time_unit": "ns" }, { "name": "BM_MatrixVectorMultiply/4/1", - "iterations": 37661946, - "real_time": 2.0775609523494111e+01, - "cpu_time": 1.7639077917004130e+01, + "iterations": 48585469, + "real_time": 1.4383808851738836e+01, + "cpu_time": 1.4028535980582999e+01, "time_unit": "ns" }, { "name": "BM_MatrixVectorMultiply/4/2", - "iterations": 35503979, - "real_time": 1.9826045271850877e+01, - "cpu_time": 1.9558512019174053e+01, + "iterations": 36885397, + "real_time": 1.9225010159014570e+01, + "cpu_time": 1.9118216349955460e+01, "time_unit": "ns" }, { "name": "BM_MatrixVectorMultiply/4/3", - "iterations": 31566368, - "real_time": 2.4285911482952638e+01, - "cpu_time": 2.4048379591849130e+01, + "iterations": 31222959, + "real_time": 2.2854289371385235e+01, + "cpu_time": 2.2695991113462323e+01, "time_unit": "ns" }, { "name": "BM_MatrixVectorMultiply/4/4", - "iterations": 27748598, - "real_time": 2.8067616677809326e+01, - "cpu_time": 2.5865126591260609e+01, + "iterations": 31518186, + "real_time": 2.2779098836662452e+01, + "cpu_time": 2.2609264378349625e+01, "time_unit": "ns" }, { "name": "BM_MatrixVectorMultiply/4/6", - "iterations": 22872005, - "real_time": 2.8774739072673267e+01, - "cpu_time": 2.8174005733209675e+01, + "iterations": 27870791, + "real_time": 2.6312030431589900e+01, + "cpu_time": 2.6164704116219813e+01, "time_unit": "ns" }, { "name": "BM_MatrixVectorMultiply/4/7", - "iterations": 23207779, - "real_time": 3.0391250791985073e+01, - "cpu_time": 2.9916649930180775e+01, + "iterations": 26635110, + "real_time": 2.8015773282089135e+01, + "cpu_time": 2.7610248277555474e+01, "time_unit": "ns" }, { "name": "BM_MatrixVectorMultiply/4/12", - "iterations": 15364422, - "real_time": 4.5653321419675805e+01, - "cpu_time": 4.2920586273925778e+01, + "iterations": 17797575, + "real_time": 3.9333237811242320e+01, + "cpu_time": 3.8665042849938892e+01, "time_unit": "ns" }, { "name": "BM_MatrixVectorMultiply/4/16", - "iterations": 10000000, - "real_time": 5.1884067698847502e+01, - "cpu_time": 5.1231399999999638e+01, + "iterations": 14592636, + "real_time": 4.8254791185539325e+01, + "cpu_time": 4.7936644208764051e+01, "time_unit": "ns" }, { "name": "BM_MatrixVectorMultiply/4/20", - "iterations": 10321743, - "real_time": 7.1935641875272623e+01, - "cpu_time": 6.9165353177268500e+01, + "iterations": 11582884, + "real_time": 6.1053182527591900e+01, + "cpu_time": 6.0339031280982766e+01, "time_unit": "ns" }, { "name": "BM_MatrixTransposeVectorMultiply/1/1", - "iterations": 57211511, - "real_time": 1.2786805141388754e+01, - "cpu_time": 1.2007094166766615e+01, + "iterations": 76783011, + "real_time": 9.3956197029191628e+00, + "cpu_time": 9.3391232078669315e+00, "time_unit": "ns" }, { "name": "BM_MatrixTransposeVectorMultiply/1/2", - "iterations": 59967446, - "real_time": 1.2595885590935398e+01, - "cpu_time": 1.2386837351719235e+01, + "iterations": 70815082, + "real_time": 9.6104494527203634e+00, + "cpu_time": 9.5503808072974952e+00, "time_unit": "ns" }, { "name": "BM_MatrixTransposeVectorMultiply/1/3", - "iterations": 48324853, - "real_time": 1.3965870500880389e+01, - "cpu_time": 1.3175870395301542e+01, + "iterations": 66582329, + "real_time": 1.0727912012247707e+01, + "cpu_time": 1.0640345728969692e+01, "time_unit": "ns" }, { "name": "BM_MatrixTransposeVectorMultiply/1/4", - "iterations": 53931199, - "real_time": 1.2531495191006034e+01, - "cpu_time": 1.2395125871390313e+01, + "iterations": 53793602, + "real_time": 1.3614292048217301e+01, + "cpu_time": 1.3500583210620450e+01, "time_unit": "ns" }, { "name": "BM_MatrixTransposeVectorMultiply/1/6", - "iterations": 46024169, - "real_time": 1.7573782894920971e+01, - "cpu_time": 1.6465957266930783e+01, + "iterations": 40321652, + "real_time": 1.7766203749914158e+01, + "cpu_time": 1.7570882264446855e+01, "time_unit": "ns" }, { "name": "BM_MatrixTransposeVectorMultiply/1/7", - "iterations": 39807333, - "real_time": 1.7835137661432338e+01, - "cpu_time": 1.7324019170035765e+01, + "iterations": 37201986, + "real_time": 1.9978757341790001e+01, + "cpu_time": 1.8992776353391505e+01, "time_unit": "ns" }, { "name": "BM_MatrixTransposeVectorMultiply/1/12", - "iterations": 39456626, - "real_time": 1.8121366004006191e+01, - "cpu_time": 1.7872739549499133e+01, + "iterations": 27434090, + "real_time": 2.6018652924849363e+01, + "cpu_time": 2.5589622254647360e+01, "time_unit": "ns" }, { "name": "BM_MatrixTransposeVectorMultiply/1/16", - "iterations": 35995824, - "real_time": 1.9690264516458388e+01, - "cpu_time": 1.9365940893587972e+01, + "iterations": 18985007, + "real_time": 3.8675490929263397e+01, + "cpu_time": 3.8356793863705086e+01, "time_unit": "ns" }, { "name": "BM_MatrixTransposeVectorMultiply/1/20", - "iterations": 28878603, - "real_time": 2.2967801144088135e+01, - "cpu_time": 2.1233817993204124e+01, + "iterations": 16250235, + "real_time": 4.2684055338167965e+01, + "cpu_time": 4.2342464585896622e+01, "time_unit": "ns" }, { "name": "BM_MatrixTransposeVectorMultiply/2/1", - "iterations": 55120280, - "real_time": 1.3296489332225445e+01, - "cpu_time": 1.2641753634052581e+01, + "iterations": 72217809, + "real_time": 9.8704761588567429e+00, + "cpu_time": 9.7949108370208933e+00, "time_unit": "ns" }, { "name": "BM_MatrixTransposeVectorMultiply/2/2", - "iterations": 45745355, - "real_time": 1.5342622390806852e+01, - "cpu_time": 1.5143701475264400e+01, + "iterations": 58405854, + "real_time": 1.2449916972022011e+01, + "cpu_time": 1.2345851496324279e+01, "time_unit": "ns" }, { "name": "BM_MatrixTransposeVectorMultiply/2/3", - "iterations": 36244078, - "real_time": 1.7288141497621908e+01, - "cpu_time": 1.7104835719644967e+01, + "iterations": 44559025, + "real_time": 1.6004563317294398e+01, + "cpu_time": 1.5868367855894519e+01, "time_unit": "ns" }, { "name": "BM_MatrixTransposeVectorMultiply/2/4", - "iterations": 43622948, - "real_time": 1.7079777392449788e+01, - "cpu_time": 1.6899912403902590e+01, + "iterations": 35586284, + "real_time": 1.8873825462252668e+01, + "cpu_time": 1.8742417724761761e+01, "time_unit": "ns" }, { "name": "BM_MatrixTransposeVectorMultiply/2/6", - "iterations": 34441361, - "real_time": 2.1433496777681615e+01, - "cpu_time": 2.1101808375110068e+01, + "iterations": 30603233, + "real_time": 2.3746124174457755e+01, + "cpu_time": 2.3614759917685756e+01, "time_unit": "ns" }, { "name": "BM_MatrixTransposeVectorMultiply/2/7", - "iterations": 29266661, - "real_time": 2.8695185282827168e+01, - "cpu_time": 2.5598239580524783e+01, + "iterations": 25503512, + "real_time": 2.7041517616016900e+01, + "cpu_time": 2.6823050880208051e+01, "time_unit": "ns" }, { "name": "BM_MatrixTransposeVectorMultiply/2/12", - "iterations": 28410129, - "real_time": 2.5414124132552242e+01, - "cpu_time": 2.5012734014689062e+01, + "iterations": 17142745, + "real_time": 4.0841419326141207e+01, + "cpu_time": 4.0593790551046581e+01, "time_unit": "ns" }, { "name": "BM_MatrixTransposeVectorMultiply/2/16", - "iterations": 26304121, - "real_time": 2.7786573175356700e+01, - "cpu_time": 2.7248239924078774e+01, + "iterations": 13289037, + "real_time": 5.3266691859451711e+01, + "cpu_time": 5.2837914440301496e+01, "time_unit": "ns" }, { "name": "BM_MatrixTransposeVectorMultiply/2/20", - "iterations": 23371507, - "real_time": 3.1584149964122386e+01, - "cpu_time": 3.1010794468666266e+01, + "iterations": 10179301, + "real_time": 7.1242744951979475e+01, + "cpu_time": 6.9943702421217523e+01, "time_unit": "ns" }, { "name": "BM_MatrixTransposeVectorMultiply/3/1", - "iterations": 45903445, - "real_time": 1.5271870291696541e+01, - "cpu_time": 1.5023186168271270e+01, + "iterations": 69897251, + "real_time": 1.0455159873353184e+01, + "cpu_time": 1.0183404780826111e+01, "time_unit": "ns" }, { "name": "BM_MatrixTransposeVectorMultiply/3/2", - "iterations": 38719384, - "real_time": 1.8626934482269160e+01, - "cpu_time": 1.8326117998158196e+01, + "iterations": 48687523, + "real_time": 1.4473279406174216e+01, + "cpu_time": 1.4368362917127623e+01, "time_unit": "ns" }, { "name": "BM_MatrixTransposeVectorMultiply/3/3", - "iterations": 32912833, - "real_time": 2.1152548974519810e+01, - "cpu_time": 2.0909078230974622e+01, + "iterations": 36199282, + "real_time": 1.8768082416320095e+01, + "cpu_time": 1.8594457204980039e+01, "time_unit": "ns" }, { "name": "BM_MatrixTransposeVectorMultiply/3/4", - "iterations": 36580841, - "real_time": 2.2141558800840450e+01, - "cpu_time": 2.1718254099188123e+01, + "iterations": 32392711, + "real_time": 2.1793111421538484e+01, + "cpu_time": 2.1554787433506387e+01, "time_unit": "ns" }, { "name": "BM_MatrixTransposeVectorMultiply/3/6", - "iterations": 27507181, - "real_time": 2.5637143914685403e+01, - "cpu_time": 2.5370575050929215e+01, + "iterations": 25449918, + "real_time": 2.7307808579096385e+01, + "cpu_time": 2.7092071573668875e+01, "time_unit": "ns" }, { "name": "BM_MatrixTransposeVectorMultiply/3/7", - "iterations": 25641026, - "real_time": 2.9056108440940619e+01, - "cpu_time": 2.8340402603234452e+01, + "iterations": 22970703, + "real_time": 3.0684889528257191e+01, + "cpu_time": 3.0455576392242047e+01, "time_unit": "ns" }, { "name": "BM_MatrixTransposeVectorMultiply/3/12", - "iterations": 21857104, - "real_time": 2.9318980271296027e+01, - "cpu_time": 2.8576201128932929e+01, + "iterations": 15189357, + "real_time": 4.8938380408733977e+01, + "cpu_time": 4.7137676729830986e+01, "time_unit": "ns" }, { "name": "BM_MatrixTransposeVectorMultiply/3/16", - "iterations": 21147525, - "real_time": 3.4772101701348014e+01, - "cpu_time": 3.3845851937756258e+01, + "iterations": 12187690, + "real_time": 6.4202213625161093e+01, + "cpu_time": 6.1960141749584857e+01, "time_unit": "ns" }, { "name": "BM_MatrixTransposeVectorMultiply/3/20", - "iterations": 18255885, - "real_time": 3.8151240495870312e+01, - "cpu_time": 3.7737693899802757e+01, + "iterations": 8626958, + "real_time": 7.7660120865570065e+01, + "cpu_time": 7.6987740058547374e+01, "time_unit": "ns" }, { "name": "BM_MatrixTransposeVectorMultiply/4/1", - "iterations": 41598327, - "real_time": 1.7972290135515205e+01, - "cpu_time": 1.7753334166539762e+01, + "iterations": 68685362, + "real_time": 1.0764260905433320e+01, + "cpu_time": 1.0544197175520464e+01, "time_unit": "ns" }, { "name": "BM_MatrixTransposeVectorMultiply/4/2", - "iterations": 34202734, - "real_time": 2.2089604240919805e+01, - "cpu_time": 2.1759722483003841e+01, + "iterations": 41396113, + "real_time": 1.6881927031100783e+01, + "cpu_time": 1.6629145833088174e+01, "time_unit": "ns" }, { "name": "BM_MatrixTransposeVectorMultiply/4/3", - "iterations": 27249123, - "real_time": 2.6488586586931632e+01, - "cpu_time": 2.6050122787437939e+01, + "iterations": 35592798, + "real_time": 2.0520861720163161e+01, + "cpu_time": 2.0318998242284707e+01, "time_unit": "ns" }, { "name": "BM_MatrixTransposeVectorMultiply/4/4", - "iterations": 30863381, - "real_time": 2.4542076224741049e+01, - "cpu_time": 2.4055303597489978e+01, + "iterations": 29245508, + "real_time": 2.4356611964112989e+01, + "cpu_time": 2.4129585986333215e+01, "time_unit": "ns" }, { "name": "BM_MatrixTransposeVectorMultiply/4/6", - "iterations": 23214321, - "real_time": 3.1091653333839805e+01, - "cpu_time": 3.0791294735693508e+01, + "iterations": 20036065, + "real_time": 3.4106466514085753e+01, + "cpu_time": 3.3619725230478345e+01, "time_unit": "ns" }, { "name": "BM_MatrixTransposeVectorMultiply/4/7", - "iterations": 21214950, - "real_time": 3.3605207788356317e+01, - "cpu_time": 3.3319333771703114e+01, + "iterations": 18768417, + "real_time": 3.6661212077921938e+01, + "cpu_time": 3.6314090847405801e+01, "time_unit": "ns" }, { "name": "BM_MatrixTransposeVectorMultiply/4/12", - "iterations": 21109771, - "real_time": 3.4236321416005268e+01, - "cpu_time": 3.3933669863117345e+01, + "iterations": 11789871, + "real_time": 6.1986798330774114e+01, + "cpu_time": 6.1171322400389727e+01, "time_unit": "ns" }, { "name": "BM_MatrixTransposeVectorMultiply/4/16", - "iterations": 17775521, - "real_time": 3.9990263516471948e+01, - "cpu_time": 3.9692563722886177e+01, + "iterations": 7869236, + "real_time": 8.5527959635890710e+01, + "cpu_time": 8.2749964545479799e+01, "time_unit": "ns" }, { "name": "BM_MatrixTransposeVectorMultiply/4/20", - "iterations": 14255226, - "real_time": 5.0482404275173749e+01, - "cpu_time": 4.9507808574903912e+01, + "iterations": 7417847, + "real_time": 9.9073482499474821e+01, + "cpu_time": 9.7701125407411183e+01, "time_unit": "ns" } ]
diff --git a/internal/ceres/small_blas.h b/internal/ceres/small_blas.h index 9e15b5e..2d050d3 100644 --- a/internal/ceres/small_blas.h +++ b/internal/ceres/small_blas.h
@@ -309,8 +309,8 @@ for (int row = 0; row < NUM_ROW_A; ++row) { double tmp = 0.0; - for (int col = 0; col < NUM_COL_A; ++col, ++A) { - tmp += (*A) * b[col]; + for (int col = 0; col < NUM_COL_A; ++col) { + tmp += A[row * NUM_COL_A + col] * b[col]; } if (kOperation > 0) { @@ -358,19 +358,18 @@ const int NUM_ROW_A = (kRowA != Eigen::Dynamic ? kRowA : num_row_a); const int NUM_COL_A = (kColA != Eigen::Dynamic ? kColA : num_col_a); + for (int row = 0; row < NUM_COL_A; ++row) { + double tmp = 0.0; + for (int col = 0; col < NUM_ROW_A; ++col) { + tmp += A[col * NUM_COL_A + row] * b[col]; + } - if (kOperation == 0) { - std::fill(c, c + NUM_COL_A, 0.0); - } - - for (int row = 0; row < NUM_ROW_A; ++row) { - const double tmp = b[row]; - for (int col = 0; col < NUM_COL_A; ++col, ++A) { - if (kOperation >= 0) { - c[col] += (*A) * tmp; - } else { - c[col] -= (*A) * tmp; - } + if (kOperation > 0) { + c[row] += tmp; + } else if (kOperation < 0) { + c[row] -= tmp; + } else { + c[row] = tmp; } } #endif // CERES_NO_CUSTOM_BLAS