Hi,
I would like to report an improvement I got by chance while working with cascadeclassifier_gpu.cpp (v 2.4.10).
I tried this file with a webcam video stream and a picture of one face held in front of the camera.
I was disappointed by the performance of my new gpu_2 against gpu_1. Gpu_1 computes at 5.5 fps, gpu_2 at 7.5 fps.
In order to simplify the main code I wrote a function dealing with gpu code only.
The result was astonishing: gpu_2 now computes about 10x faster on average, between 40 and 100 fps (average 70);
likewise gpu_1 runs between 30 and 40 fps (average 35).
To be precise, the high-speed computing appears only when there is a detection; without one, the speed drops back to 7.5 fps for gpu_2 and 5.5 fps for gpu_1.
What is going on? Does anybody have an idea?
Regards
Linux  3.19.8-100.fc20.x86_64 #1 SMP 
nvidia driver 340.76
GPU_1   geforce 9500 GT 500M    32 cores
GPU_2   geforce GT 720 1024M  192 cores
 ******* part of the original code without the use of a function:
        (image.empty() ? frame : image).copyTo(frame_cpu);
        frame_gpu.upload(image.empty() ? frame : image);
        convertAndResize(frame_gpu, gray_gpu, resized_gpu, scaleFactor);
        convertAndResize(frame_cpu, gray_cpu, resized_cpu, scaleFactor);
        TickMeter tm;
        tm.start();
 	cascade_gpu.findLargestObject = findLargestObject;
	 detections_num = cascade_gpu.detectMultiScale(resized_gpu, facesBuf_gpu, 1.2,
                                                          (filterRects || findLargestObject) ? 4 : 0);
        facesBuf_gpu.colRange(0, detections_num).download(faces_downloaded);
        resized_gpu.download(resized_cpu);
        for (int i = 0; i< detections_num; ++i)
        {
                rectangle(resized_cpu, faces_downloaded.ptr()[i], Scalar(255));
        }
        tm.stop();
        double detectionTime = tm.getTimeMilli();
        double fps = 1000 / detectionTime;
***** the function created
/**
 * Runs the GPU cascade classifier on one frame and hands back the detections.
 *
 * The frame is uploaded to the GPU, passed through convertAndResize(), and
 * searched with cascade_gpu.detectMultiScale(). The resized image is
 * downloaded into resized_cpu so the caller can draw on it.
 *
 * This helper always sets cascade_gpu.findLargestObject to true and always
 * passes 4 as the fourth detectMultiScale() argument, whatever the caller's
 * own findLargestObject / filterRects settings are.
 * NOTE(review): that difference in settings may change detection timing
 * compared with code that leaves those options off -- confirm before
 * comparing frame rates.
 *
 * @param frame        Image to analyse; uploaded to the GPU as-is.
 * @param cascade_gpu  Classifier to run; its findLargestObject flag is
 *                     overwritten with true.
 * @param scaleFactor  Forwarded to convertAndResize().
 * @param faceRects    Out: pointer to the first detected rectangle, or NULL
 *                     when nothing was detected. The storage belongs to this
 *                     function and stays valid only until the next call, so
 *                     the function is not reentrant.
 * @param resized_cpu  Out: receives the downloaded resized image.
 * @return             The value returned by detectMultiScale(), used as the
 *                     number of entries readable through faceRects.
 */
static int getTargets (Mat &frame, CascadeClassifier_GPU &cascade_gpu, double scaleFactor, Rect* &faceRects, Mat &resized_cpu)
{
    // Host-side copy of the detections. It is static because faceRects is
    // pointed into this buffer: a plain local Mat would be released when the
    // function returns and leave the caller with a dangling pointer.
    static Mat faces_downloaded;

    GpuMat facesBuf_gpu, frame_gpu, gray_gpu, resized_gpu;

    frame_gpu.upload(frame);
    convertAndResize(frame_gpu, gray_gpu, resized_gpu, scaleFactor);

    cascade_gpu.findLargestObject = true;

    const int detections_num = cascade_gpu.detectMultiScale(resized_gpu, facesBuf_gpu, 1.2, 4);

    resized_gpu.download(resized_cpu);

    if (detections_num <= 0)
    {
        // Nothing to download; never leave the out-pointer referring to
        // rectangles from a previous call.
        faceRects = NULL;
        return detections_num;
    }

    facesBuf_gpu.colRange(0, detections_num).download(faces_downloaded);

    // Typed accessor: the untyped ptr() yields a byte pointer, which does not
    // convert to Rect*.
    faceRects = faces_downloaded.ptr<Rect>();

    return detections_num;
}
*****  main code modified 
        (image.empty() ? frame : image).copyTo(frame_cpu);
        convertAndResize(frame_cpu, gray_cpu, resized_cpu, scaleFactor);
        TickMeter tm;
        tm.start();
	     
	 Rect *faceRects ;
	 detections_num = getTargets(frame, cascade_gpu, scaleFactor, faceRects, resized_cpu);  
	
        for (int i = 0; i < detections_num; ++i)
        {
		rectangle(resized_cpu, faceRects[i], Scalar(255));
        }
        tm.stop();  
                       
                           
                       
                     ↧