/*
* Samsung Exynos SoC series VPU driver
*
* Copyright (c) 2016 Samsung Electronics Co., Ltd
*
* This program is free software; you can redistribute it and/or modify
* it under the terms of the GNU General Public License version 2 as
* published by the Free Software Foundation.
*/
#include <linux/bitmap.h>
#include "lib/vpul-def.h"
#include "lib/vpul-errno.h"
#include "lib/vpul-gen.h"
#include "lib/vpul-ds.h"
#include "lib/vpul-hwmapper.h"
#include "vpul-latency-balancing.h"
#include "lib/vpul-translator.h"
#include "lib/vpul-pu.h"
#include "vpu-hardware.h"
#include "lib/vpul-hw-v2.1.h"
#include "lib/vpu-fwif-hw-gen.h"
struct port_index_2_pu {
struct vpul_pu *pu;
__u32 port_index_in_pu;
};
/**
 * The following structure is used for returning the results of the
 * calculation of the number of MPRBs needed.
 * In addition to the numbers of large MPRBs and small MPRBs, it also returns
 * an array describing the PU RAM ports to use for MPRB connections; this is
 * useful for the exceptions to the rule of using consecutive ports,
 * such as fast disparity, upscaler and inpaint.
 * This structure is initialized by the calling function to default values :
 * - num_lrg_mprbs and num_sm_mprbs both initialized to 0
 * - ram_ports_array = consecutive_mem_ports[]
 * The functions that calculate the number of MPRBs needed and the RAM ports
 * to use only need to write to struct result_of_calc_num_mprbs_needed the
 * data which differs from the default values; for example, if no small MPRBs
 * are needed, there is no need to write 0 to num_sm_mprbs.
 */
struct result_of_calc_num_mprbs_needed {
__u32 num_lrg_mprbs;
__u32 num_sm_mprbs;
const __u32 *ram_ports_array;
};
typedef void (*__calc_nbr_of_mprbs_needed)(const struct vpul_pu *pu,
__u32 width,
struct result_of_calc_num_mprbs_needed *num_mprbs_needed_calc_result);
#include "lib/mem_inter_connect.def"
static const struct pu_ram_port_range pu_inst_2_ram_port[VPU_PU_NUMBER] = {
#define VPU_PU_INSTANCE(a, b, c, d, e, f, g, h, i) {f, g},
#include "lib/vpul_pu_instances.def"
#undef VPU_PU_INSTANCE
};
const enum VPU_PU_TYPES pu_inst2type[VPU_PU_NUMBER + 1] = {
#define VPU_PU_INSTANCE(a, b, c, d, e, f, g, h, i) b,
#include "lib/vpul_pu_instances.def"
/* indicates "end of array" */
END_OF_PU_INST2TYPE_TRANSLATOR
#undef VPU_PU_INSTANCE
};
/**
* the following arrays specify patterns of PU RAM ports to be used for
* MPRB connections
*/
const __u32 upscaler_2_mem_ports[] = {0, 2};
const __u32 inpaint_4_mem_ports[] = {0, 2, 4, 6};
const __u32 fast_disp__mem_ports_width_511_or_less[] = {0, 1, 2, 3, 4, 5, 6,
7, 8, 9, 10, 11, 12,
20, 22, 24, 26, 28,
30, 32, 34, 37, 40};
const __u32 fast_disp__mem_ports_width_512_to_1023[] = {0, 1, 2, 3, 4, 5, 6,
7, 8, 9, 10, 11, 12,
20, 21, 22, 23, 24,
25, 26, 27, 28, 29,
30, 31, 32, 33, 34,
35, 37, 38, 40, 41};
const __u32 fast_disp__mem_ports_width_1024_or_more[] = {0, 1, 2, 3, 4, 5, 6,
7, 8, 9, 10, 11, 12,
13, 14, 15, 16, 17, 18,
19, 34, 35, 36, 37, 38,
39, 40, 41, 42};
const __u32 integr_img_mem_ports_width_up_to_1024[] = {0, 4, 5, 6, 7, 8, 9, 10,
11, 12, 13, 14, 15, 16, 17, 18, 19};
const __u32 integr_img_mem_ports_width_1025_to_2048[] = {0, 1, 4, 5, 6, 7, 8,
9, 10, 11, 12, 13, 14,
15, 16, 17, 18, 19};
const __u32 consecutive_mem_ports[] = {0, 1, 2, 3, 4, 5, 6, 7, 8, 9,
10, 11, 12, 13, 14, 15, 16, 17,
18, 19, 20, 21, 22, 23, 24, 25,
26, 27, 28, 29, 30, 31, 32, 33,
34, 35, 36, 37, 38, 39, 40, 41,
42, 43, 44, 45, 46};
/**
* **************************************************************
 * following declarations are for filters (linear and non-linear)
* **************************************************************
*/
#define NUM_OF_WIDTH_RANGES_FOR_FILTERS_MEM_REQUIR 5
#define NUM_OF_FILTER_SIZES_FOR_FILTERS_MEM_REQUIR 6
struct mprb_requirements {
__u32 mprb_num_lrg;
__u32 mprb_num_sm;
};
struct mprb_req_per_pixel_size {
struct mprb_requirements mprb_req_for_8bit_pixel;
struct mprb_requirements mprb_req_for_16bit_pixel;
struct mprb_requirements mprb_req_for_32bit_pixel;
};
const __u32 width_sizes_2_index[NUM_OF_WIDTH_RANGES_FOR_FILTERS_MEM_REQUIR] = {
257, 513, 1025, 2049, 4097};
/**
 * the following LUT provides the number of MPRBs used for filters, according
 * to width and filter size
 * The contents of each entry are pairs of values :
 * - 1st value = number of large MPRBs needed
 * - 2nd value = number of small MPRBs needed
 * No pair has both of its values different from 0
 */
const struct mprb_req_per_pixel_size mprb_req_per_pixel_size_values
[NUM_OF_WIDTH_RANGES_FOR_FILTERS_MEM_REQUIR]
[NUM_OF_FILTER_SIZES_FOR_FILTERS_MEM_REQUIR] = {
/* 1 byte/pixel - 2 bytes/pixel - 4 bytes/pixel */
/* width 1 to 256, filter size 1x1 */
{
{{0, 0}, {0, 0}, {0, 0} },
/* width 1 to 256, filter size 3x3 */
{{0, 1}, {0, 1}, {0, 2} },
/* width 1 to 256, filter size 5x5 */
{{0, 1}, {0, 2}, {0, 4} },
/* width 1 to 256, filter size 7x7 */
{{0, 4}, {0, 4}, {0, 6} },
/* width 1 to 256, filter size 9x9 */
{{0, 2}, {0, 4}, {0, 0} },
/* width 1 to 256, filter size 11x11 */
{{0, 4}, {0, 6}, {0, 0} },
}, {
/* width 257 to 512, filter size 1x1 */
{{0, 0}, {0, 0}, {0, 0} },
/* width 257 to 512, filter size 3x3 */
{{0, 1}, {0, 2}, {2, 0} },
/* width 257 to 512, filter size 5x5 */
{{0, 2}, {0, 4}, {4, 0} },
/* width 257 to 512, filter size 7x7 */
{{0, 4}, {0, 4}, {6, 0} },
/* width 257 to 512, filter size 9x9 */
{{0, 4}, {4, 0}, {0, 0} },
/* width 257 to 512, filter size 11x11 */
{{4, 0}, {0, 6}, {0, 0} },
}, {
/* width 513 to 1024, filter size 1x1 */
{{0, 0}, {0, 0}, {0, 0} },
/* width 513 to 1024, filter size 3x3 */
{{0, 2}, {1, 0}, {2, 0} },
/* width 513 to 1024, filter size 5x5 */
{{1, 0}, {2, 0}, {4, 0} },
/* width 513 to 1024, filter size 7x7 */
{{4, 0}, {4, 0}, {6, 0} },
/* width 513 to 1024, filter size 9x9 */
{{2, 0}, {4, 0}, {0, 0} },
/* width 513 to 1024, filter size 11x11 */
{{4, 0}, {6, 0}, {0, 0} },
}, {
/* width 1025 to 2048, filter size 1x1 */
{{0, 0}, {0, 0}, {0, 0} },
/* width 1025 to 2048, filter size 3x3 */
{{1, 0}, {2, 0}, {4, 0} },
/* width 1025 to 2048, filter size 5x5 */
{{2, 0}, {4, 0}, {8, 0} },
/* width 1025 to 2048, filter size 7x7 */
{{4, 0}, {8, 0}, {12, 0} },
/* width 1025 to 2048, filter size 9x9 */
{{4, 0}, {8, 0}, {0, 0} },
/* width 1025 to 2048, filter size 11x11 */
{{8, 0}, {12, 0}, {0, 0} },
}, {
/* width 2049 to 4096, filter size 1x1 */
{{0, 0}, {0, 0}, {0, 0} },
/* width 2049 to 4096, filter size 3x3 */
{{2, 0}, {4, 0}, {8, 0} },
/* width 2049 to 4096, filter size 5x5 */
{{4, 0}, {8, 0}, {16, 0} },
/* width 2049 to 4096, filter size 7x7 */
{{8, 0}, {12, 0}, {24, 0} },
/* width 2049 to 4096, filter size 9x9 */
{{8, 0}, {16, 0}, {0, 0} },
/* width 2049 to 4096, filter size 11x11 */
{{16, 0}, {24, 0}, {0, 0} }
}
};
/**
* ***********************************************
* following declarations are for non-filter PUs
* ***********************************************
*/
struct mprb_needed_per_img_width {
__u32 width_upper_limit;
__u32 nbr_lrg_mprbs;
__u32 nbr_sm_mprbs;
const __u32 *ram_ports_array;
};
const struct mprb_needed_per_img_width mprb_per_img_width_upsc[] = {
{1025, 2, 0, upscaler_2_mem_ports},
{0xFFFFFFFF, 4, 0, consecutive_mem_ports}
};
const struct mprb_needed_per_img_width mprb_per_img_width_downsc[] = {
{1025, 1, 0, consecutive_mem_ports},
{2049, 2, 0, consecutive_mem_ports},
{0xFFFFFFFF, 4, 0, consecutive_mem_ports}
};
const struct mprb_needed_per_img_width mprb_per_img_width_integ_II_or_CC[] = {
{1025, 1, 0, integr_img_mem_ports_width_up_to_1024},
{2049, 2, 0, integr_img_mem_ports_width_1025_to_2048},
{0xFFFFFFFF, 4, 0, consecutive_mem_ports}
};
const struct mprb_needed_per_img_width mprb_per_img_width_inpnt[] = {
{621, 4, 0, inpaint_4_mem_ports},
{0xFFFFFFFF, 8, 0, consecutive_mem_ports}
};
const struct mprb_needed_per_img_width mprb_per_img_width_fdepth[] = {
{512, 13, 10, fast_disp__mem_ports_width_511_or_less},
{1024, 13, 20, fast_disp__mem_ports_width_512_to_1023},
{0xFFFFFFFF, 20, 9, fast_disp__mem_ports_width_1024_or_more}
};
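/**
 * Resolves the "inout" width for a single DMA PU : the PU's inout index is
 * followed through the process I/O descriptors (inout type -> ROI -> memory
 * map entry) to the image width, which is stored at the PU's input size
 * index in actual_sizes_array.
 */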
static __s32 set_size_for_dma(
const struct vpul_task *task,
const struct vpul_vertex *vertex,
const struct vpul_pu *pu,
__u32 *actual_sizes_array)
{
__u32 inout_type;
__u32 memmap_index;
__u32 roi_idx;
const union vpul_pu_parameters *pu_params;
const struct vpul_pu_dma *pu_dma_in_params;
const struct vpul_image_size_desc *size_desc;
const struct vpul_process *process;
__u32 in_size_index;
in_size_index = pu->in_size_idx;
pu_params = &pu->params;
pu_dma_in_params = &pu_params->dma;
inout_type = pu_dma_in_params->inout_index;
process = &vertex->proc;
roi_idx = process->io.inout_types[inout_type].roi_index;
memmap_index = process->io.fixed_map_roi[roi_idx].memmap_idx;
size_desc =
&task->memmap_desc[memmap_index].image_sizes;
if (process->io.sizes[in_size_index].type != VPUL_SIZEOP_INOUT) {
if (is_pu_dma_in(pu))
return -1; /* err */
else
return 0; /* dma out case, just do not update */
}
actual_sizes_array[in_size_index] = size_desc->width;
return 0;
}
static __s32 set_sizes_for_all_dma_pus(const struct vpul_task *task,
const struct vpul_vertex *vertex,
__u32 *actual_sizes_array)
{
__u32 num_of_pus;
const struct vpul_pu *curr_pu;
__u32 i, j, subchain_count;
const struct vpul_subchain *subchain;
subchain_count = vertex->num_of_subchains;
subchain = fst_vtx_sc_ptr(task, vertex);
for (i = 0; i < subchain_count; i++, subchain++) {
num_of_pus = subchain->num_of_pus;
curr_pu = fst_sc_pu_ptr(task, subchain);
if ((num_of_pus != 0) && (curr_pu != NULL)) {
for (j = 0; j < num_of_pus; j++, curr_pu++) {
if (is_pu_dma_in(curr_pu) || is_pu_dma_out(curr_pu))
if (set_size_for_dma(
task,
vertex,
curr_pu,
actual_sizes_array))
return -1;
}
}
}
return 0;
}
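/**
 * For 3DNN processes the input width may differ from layer to layer; the
 * largest input width among all layers (the worst case for MPRB allocation)
 * is assigned to every size operation of type "inout" for data input.
 */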
static __s32 set_3dnn_data_inout_sizes(const struct vpul_task *task,
const struct vpul_3dnn_process_base *proc3dnn_base,
__u32 *actual_sizes_array)
{
__u32 i;
__u32 num_layers = proc3dnn_base->number_of_layers;
const struct vpul_3dnn_size *size_op;
__u32 largest_width = 0;
__u32 num_operations = proc3dnn_base->io.n_sizes_op;
for (i = 0; i < num_layers; i++) {
if (proc3dnn_base->layers[i].dim_size_input.x > largest_width)
largest_width = proc3dnn_base->layers[i].dim_size_input.x;
}
for (i = 0; i < num_operations; i++) {
size_op = &proc3dnn_base->io.sizes_3dnn[i];
if ((size_op->type == VPUL_3DXY_SIZEOP_INOUT) &&
(size_op->inout_3dnn_type == VPUL_IO_3DNN_INPUT))
actual_sizes_array[i] = largest_width;
}
return 0;
}
static __u32 set_size_crop(const struct vpul_sizes *size_op,
const struct vpul_process *process,
__u32 *actual_sizes_array,
__u32 index)
{
__u32 previous_size;
const struct vpul_croppers *crop_params;
previous_size = actual_sizes_array[size_op->src_idx];
if (previous_size == 0xFFFFFFFF)
return 0;
crop_params = &process->io.croppers[size_op->op_ind];
actual_sizes_array[index] = previous_size - crop_params->Left - crop_params->Right;
return 1;
}
static __u32 set_size_scale(const struct vpul_sizes *size_op,
const struct vpul_process *process,
__u32 *actual_sizes_array,
__u32 index)
{
__u32 previous_size;
const struct vpul_scales *scale_params;
__u32 temp_val;
previous_size = actual_sizes_array[size_op->src_idx];
if (previous_size == 0xFFFFFFFF)
return 0;
scale_params = &process->io.scales[size_op->op_ind];
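/* scaled width = ceil(previous_size * numerator / denominator),
 * implemented as (size * num + den - 1) / den
 */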
temp_val = previous_size * scale_params->horizontal.numerator;
temp_val += scale_params->horizontal.denominator - 1;
actual_sizes_array[index] = temp_val / scale_params->horizontal.denominator;
return 1;
}
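/**
 * Calculates the actual width for every size operation of the process.
 * The "inout" entries are seeded from the DMA PUs; the remaining crop and
 * scale entries are resolved by repeated sweeps over the list, each sweep
 * computing the entries whose source entry is already known. A sweep that
 * leaves entries pending without resolving any new one indicates an
 * unsatisfiable dependency and fails the calculation.
 */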
static __s32 set_actual_sizes(const struct vpul_task *task,
const struct vpul_vertex *vertex,
__u32 *actual_sizes_array)
{
__u32 i;
const struct vpul_process *process = &vertex->proc;
__u32 num_operations = process->io.n_sizes_op;
const struct vpul_sizes *size_op;
__u32 sizes_were_calculated_flag;
__u32 sizes_left_to_calc_flag;
/* initialize all entries in array to "not calculated yet" */
for (i = 0; i < num_operations; i++)
actual_sizes_array[i] = 0xFFFFFFFF;
/* set actual sizes for all DMA-in PUs (should be of type "inout") */
if (set_sizes_for_all_dma_pus(task, vertex, actual_sizes_array))
return -1;
sizes_left_to_calc_flag = 1;
while (sizes_left_to_calc_flag) {
sizes_were_calculated_flag = 0;
sizes_left_to_calc_flag = 0;
for (i = 0; i < num_operations; i++) {
if (actual_sizes_array[i] == 0xFFFFFFFF) {
size_op = &process->io.sizes[i];
switch (size_op->type) {
case VPUL_SIZEOP_INOUT:
return -1;
case VPUL_SIZEOP_FIX:
/* TBD */
break;
case VPUL_SIZEOP_FORCE_CROP:
case VPUL_SIZEOP_CROP_ON_EDGES_TILES:
if (set_size_crop(size_op, process,
actual_sizes_array, i))
sizes_were_calculated_flag = 1;
else
sizes_left_to_calc_flag = 1;
break;
case VPUL_SIZEOP_SCALE:
if (set_size_scale(size_op, process,
actual_sizes_array, i))
sizes_were_calculated_flag = 1;
else
sizes_left_to_calc_flag = 1;
break;
default:
return -1;
}
}
}
if ((sizes_left_to_calc_flag) && (!sizes_were_calculated_flag))
return -1;
}
return 0;
}
static __u32 set_size_3dnn_crop(const struct vpul_3dnn_size *size_op_3dnn,
const struct vpul_3dnn_process_base *proc_3dnn_base,
__u32 *actual_sizes_array,
__u32 index)
{
const struct vpul_croppers *crop_params;
__u32 previous_size;
__u32 i;
__u32 crop_val;
__u32 smallest_crop_val;
previous_size = actual_sizes_array[size_op_3dnn->src_idx];
if (previous_size == 0xFFFFFFFF)
return 0;
/* find smallest crop size --> largest remaining size
* ("worst case" for MPRBs allocation)
*/
crop_params = &proc_3dnn_base->layers[0].croppers[size_op_3dnn->op_ind];
smallest_crop_val = crop_params->Left + crop_params->Right;
for (i = 0; i < proc_3dnn_base->number_of_layers; i++) {
crop_params = &proc_3dnn_base->layers[i].croppers[size_op_3dnn->op_ind];
crop_val = crop_params->Left + crop_params->Right;
if (crop_val < smallest_crop_val)
smallest_crop_val = crop_val;
}
actual_sizes_array[index] = previous_size - smallest_crop_val;
return 1;
}
static __u32 set_size_3dnn_scale(const struct vpul_3dnn_size *size_op_3dnn,
const struct vpul_3dnn_process_base *proc_3dnn_base,
__u32 *actual_sizes_array,
__u32 index)
{
__u32 previous_size, temp_val, i;
const struct vpul_ratio *scale_val;
const struct vpul_ratio *largest_scale_val;
previous_size = actual_sizes_array[size_op_3dnn->src_idx];
if (previous_size == 0xFFFFFFFF)
return 0;
/* find largest scale size --> "worst case" for MPRBs allocation */
largest_scale_val = &proc_3dnn_base->layers[0].scales[size_op_3dnn->op_ind].horizontal;
for (i = 0; i < proc_3dnn_base->number_of_layers; i++) {
scale_val = &proc_3dnn_base->layers[i].scales[size_op_3dnn->op_ind].horizontal;
if ((scale_val->numerator * largest_scale_val->denominator) >
(scale_val->denominator * largest_scale_val->numerator)) {
largest_scale_val = scale_val;
}
}
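/* scaled width = ceil(previous_size * numerator / denominator) */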
temp_val = previous_size * largest_scale_val->numerator;
temp_val += largest_scale_val->denominator - 1;
actual_sizes_array[index] = temp_val / largest_scale_val->denominator;
return 1;
}
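/**
 * 3DNN variant of set_actual_sizes() : the "inout" entries are seeded with
 * the largest layer input width, then crop and scale entries are resolved
 * by repeated sweeps, failing if no progress is made while entries remain.
 */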
static __s32 set_actual_sizes_3dnn(const struct vpul_task *task,
const struct vpul_3dnn_process_base *proc_3dnn_base,
__u32 *actual_sizes_array)
{
__u32 i;
const struct vpul_3dnn_size *size_op_3dnn;
__u32 sizes_were_calculated_flag;
__u32 sizes_left_to_calc_flag;
__u32 num_operations = proc_3dnn_base->io.n_sizes_op;
/* initialize all entries in array to "not calculated yet" */
for (i = 0; i < num_operations; i++)
actual_sizes_array[i] = 0xFFFFFFFF;
/* set actual sizes for all DMA-in PUs for data input (should be of type "inout") */
if (set_3dnn_data_inout_sizes(task, proc_3dnn_base, actual_sizes_array))
return -1;
sizes_left_to_calc_flag = 1;
while (sizes_left_to_calc_flag) {
sizes_were_calculated_flag = 0;
sizes_left_to_calc_flag = 0;
for (i = 0; i < num_operations; i++) {
if (actual_sizes_array[i] == 0xFFFFFFFF) {
size_op_3dnn = &proc_3dnn_base->io.sizes_3dnn[i];
if (size_op_3dnn->type == VPUL_3DXY_SIZEOP_CROP) {
if (set_size_3dnn_crop(size_op_3dnn,
proc_3dnn_base,
actual_sizes_array,
i))
sizes_were_calculated_flag = 1;
else
sizes_left_to_calc_flag = 1;
} else if (size_op_3dnn->type == VPUL_3DXY_SIZEOP_SCALE) {
if (set_size_3dnn_scale(size_op_3dnn,
proc_3dnn_base,
actual_sizes_array,
i))
sizes_were_calculated_flag = 1;
else
sizes_left_to_calc_flag = 1;
}
}
}
if ((sizes_left_to_calc_flag) && (!sizes_were_calculated_flag))
return -1;
}
return 0;
}
static __u32 get_width_index_for_filter_mem_req(__u32 width)
{
__u32 i;
__u32 retval = 0xFF;
for (i = 0; i < sizeof(width_sizes_2_index) / sizeof(__u32); i++) {
if (width < width_sizes_2_index[i]) {
retval = i;
break;
}
}
return retval;
}
/**
 * this function returns index in range 0 to 5, according to
 * VPUH_FILTER_SIZE values - invalid input triggers BUG_ON
 */
__u32 get_filter_size_index_for_linear_filter(__u32 filter_size)
{
__u32 retval = 0xFF;
switch (filter_size) {
case VPUH_FILTER_SIZE_1x1:
retval = 0;
break;
case VPUH_FILTER_SIZE_3x3:
retval = 1;
break;
case VPUH_FILTER_SIZE_5x5:
retval = 2;
break;
case VPUH_FILTER_SIZE_7x7:
retval = 3;
break;
case VPUH_FILTER_SIZE_9x9:
retval = 4;
break;
case VPUH_FILTER_SIZE_11x11:
retval = 5;
break;
default:
break;
}
BUG_ON(retval == 0xFF);
return retval;
}
/**
* this function returns index = 1, 2 or 3, according to
* VPUH_NLF_MODE values - or 0xFF in case of invalid input
*/
__u32 get_filter_size_idx_for_non_linear_filter(__u32 nlf_mode)
{
__u32 retval = 0xFF;
switch (nlf_mode) {
case VPUH_NLF_MODE_MIN:
case VPUH_NLF_MODE_MAX:
case VPUH_NLF_MODE_MED:
retval = 1;
break;
case VPUH_NLF_MODE_FAST:
retval = 3;
break;
case VPUH_NLF_MODE_CENSUS:
case VPUH_NLF_MODE_SM_SMOOTH:
retval = 2;
break;
default:
break;
}
return retval;
}
void calc_num_mprbs_for_filters(__u32 img_width, __u32 pixel_bytes,
__u32 filter_size_index,
struct result_of_calc_num_mprbs_needed *num_mprbs_needed_calc_result)
{
__u32 width_idx;
const struct mprb_req_per_pixel_size *mprb_req_per_pix_size;
const struct mprb_requirements *mprb_req;
width_idx = get_width_index_for_filter_mem_req(img_width);
/* widths above 4096 are not covered by the filter LUT */
BUG_ON(width_idx == 0xFF);
mprb_req_per_pix_size =
&mprb_req_per_pixel_size_values[width_idx][filter_size_index];
if (pixel_bytes == VPUH_BITS_8_BITS)
mprb_req = &mprb_req_per_pix_size->mprb_req_for_8bit_pixel;
else if (pixel_bytes == VPUH_BITS_16_BITS)
mprb_req = &mprb_req_per_pix_size->mprb_req_for_16bit_pixel;
else
mprb_req = &mprb_req_per_pix_size->mprb_req_for_32bit_pixel;
num_mprbs_needed_calc_result->num_lrg_mprbs = mprb_req->mprb_num_lrg;
num_mprbs_needed_calc_result->num_sm_mprbs = mprb_req->mprb_num_sm;
}
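/**
 * the mprb_needed_per_img_width tables are sorted by ascending
 * width_upper_limit and end with a 0xFFFFFFFF sentinel, so for any width
 * below 0xFFFFFFFF the lookup stops at a valid entry
 */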
void get_num_mprbs_acc_to_width_for_non_filters(
__u32 width,
const struct mprb_needed_per_img_width *mprb_per_image_width,
__u32 nbr_entries,
struct result_of_calc_num_mprbs_needed *num_mprbs_needed_calc_result)
{
__u32 i;
for (i = 0; i < nbr_entries; i++) {
if (width < mprb_per_image_width[i].width_upper_limit)
break;
}
num_mprbs_needed_calc_result->num_lrg_mprbs =
mprb_per_image_width[i].nbr_lrg_mprbs;
num_mprbs_needed_calc_result->num_sm_mprbs =
mprb_per_image_width[i].nbr_sm_mprbs;
num_mprbs_needed_calc_result->ram_ports_array =
mprb_per_image_width[i].ram_ports_array;
}
void vpu_resource_no_mprbs_needed(const struct vpul_pu *pu,
__u32 img_width,
struct result_of_calc_num_mprbs_needed *num_mprbs_needed_calc_result)
{
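	/* intentionally empty : this PU type uses no MPRBs, so the default
	 * values (0 MPRBs of each size) set by the caller are kept
	 */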
}
void vpu_resource_calc_num_mprbs_for_nms(const struct vpul_pu *pu,
__u32 img_width,
struct result_of_calc_num_mprbs_needed *num_mprbs_needed_calc_result)
{
const union vpul_pu_parameters *pu_params = &pu->params;
const struct vpul_pu_nms *nms = &(pu_params->nms);
__u32 filter_size_index;
filter_size_index =
get_filter_size_index_for_linear_filter(nms->support);
calc_num_mprbs_for_filters(img_width, nms->bits_in, filter_size_index,
num_mprbs_needed_calc_result);
}
void vpu_resource_calc_num_mprbs_for_slf(const struct vpul_pu *pu,
__u32 img_width,
struct result_of_calc_num_mprbs_needed *num_mprbs_needed_calc_result)
{
const union vpul_pu_parameters *pu_params = &pu->params;
const struct vpul_pu_slf *slf = &(pu_params->slf);
__u32 filter_size_index = 0;
__u32 ram_size_for_1_line = 0;
__u32 num_of_line_buffers = 0;
if (slf->invert_columns) {
ram_size_for_1_line = img_width;
if (slf->bits_in == VPUH_BITS_16_BITS)
/* double line size if 16 bits per pixel */
ram_size_for_1_line <<= 1;
/* divide by 1024 to find number of small MPRBs needed */
num_of_line_buffers = (ram_size_for_1_line + 1023) >> 10;
if (num_of_line_buffers < 4)
num_mprbs_needed_calc_result->num_sm_mprbs = num_of_line_buffers;
else
/* divide by 4096 to find number of large MPRBs needed */
num_mprbs_needed_calc_result->num_lrg_mprbs =
(ram_size_for_1_line + 4095) >> 12;
} else {
filter_size_index =
get_filter_size_index_for_linear_filter(slf->filter_size_mode);
calc_num_mprbs_for_filters(img_width, slf->bits_in, filter_size_index,
num_mprbs_needed_calc_result);
}
}
void vpu_resource_calc_num_mprbs_for_glf5(const struct vpul_pu *pu,
__u32 img_width,
struct result_of_calc_num_mprbs_needed *num_mprbs_needed_calc_result)
{
const union vpul_pu_parameters *pu_params = &pu->params;
const struct vpul_pu_glf *glf = &(pu_params->glf);
__u32 filter_size_index;
filter_size_index =
get_filter_size_index_for_linear_filter(glf->filter_size_mode);
calc_num_mprbs_for_filters(img_width, glf->bits_in, filter_size_index,
num_mprbs_needed_calc_result);
}
void vpu_resource_calc_num_mprbs_for_nlf(const struct vpul_pu *pu,
__u32 img_width,
struct result_of_calc_num_mprbs_needed *num_mprbs_needed_calc_result)
{
const union vpul_pu_parameters *pu_params = &pu->params;
const struct vpul_pu_nlf *nlf = &(pu_params->nlf);
__u32 filter_size_index;
filter_size_index = get_filter_size_idx_for_non_linear_filter(
nlf->filter_mode);
calc_num_mprbs_for_filters(img_width, nlf->bits_in, filter_size_index,
num_mprbs_needed_calc_result);
}
void vpu_resource_calc_num_mprbs_for_upsc(const struct vpul_pu *pu,
__u32 img_width,
struct result_of_calc_num_mprbs_needed *num_mprbs_needed_calc_result)
{
get_num_mprbs_acc_to_width_for_non_filters(img_width,
mprb_per_img_width_upsc, 2,
num_mprbs_needed_calc_result);
}
void vpu_resource_calc_num_mprbs_for_downsc(const struct vpul_pu *pu,
__u32 img_width,
struct result_of_calc_num_mprbs_needed *num_mprbs_needed_calc_result)
{
get_num_mprbs_acc_to_width_for_non_filters(img_width,
mprb_per_img_width_downsc, 3,
num_mprbs_needed_calc_result);
}
void vpu_resource_calc_num_mprbs_for_intimg(const struct vpul_pu *pu,
__u32 img_width,
struct result_of_calc_num_mprbs_needed *num_mprbs_needed_calc_result)
{
const union vpul_pu_parameters *pu_params = &pu->params;
const struct vpul_pu_integral *intg_pr = &pu_params->integral;
switch (intg_pr->integral_image_mode) {
case VPUH_INTIMG_MODE_LUT:
/* lut size / 2048 */
num_mprbs_needed_calc_result->num_lrg_mprbs =
(intg_pr->lut_number_of_values + 2047) >> 11;
break;
case VPUH_INTIMG_MODE_CONNECTED_COMP:
get_num_mprbs_acc_to_width_for_non_filters(img_width,
mprb_per_img_width_integ_II_or_CC, 3,
num_mprbs_needed_calc_result);
num_mprbs_needed_calc_result->num_lrg_mprbs +=
((intg_pr->cc_label_vector_size + 2047) >> 11);
break;
default:
/* II mode */
get_num_mprbs_acc_to_width_for_non_filters(img_width,
mprb_per_img_width_integ_II_or_CC, 3,
num_mprbs_needed_calc_result);
break;
}
}
void vpu_resource_calc_num_mprbs_for_histog(const struct vpul_pu *pu,
__u32 img_width,
struct result_of_calc_num_mprbs_needed *num_mprbs_needed_calc_result)
{
num_mprbs_needed_calc_result->num_lrg_mprbs = 2;
}
void vpu_resource_calc_num_mprbs_for_dispar(const struct vpul_pu *pu,
__u32 img_width,
struct result_of_calc_num_mprbs_needed *num_mprbs_needed_calc_result)
{
num_mprbs_needed_calc_result->num_lrg_mprbs = 2;
}
void vpu_resource_calc_num_mprbs_for_inpnt(const struct vpul_pu *pu,
__u32 img_width,
struct result_of_calc_num_mprbs_needed *num_mprbs_needed_calc_result)
{
get_num_mprbs_acc_to_width_for_non_filters(img_width,
mprb_per_img_width_inpnt, 2,
num_mprbs_needed_calc_result);
}
void vpu_resource_calc_num_mprbs_for_tn2(const struct vpul_pu *pu,
__u32 img_width,
struct result_of_calc_num_mprbs_needed *num_mprbs_needed_calc_result)
{
/* all RAM ports in use */
num_mprbs_needed_calc_result->num_lrg_mprbs = 4;
}
void vpu_resource_calc_num_mprbs_for_tn3(const struct vpul_pu *pu,
__u32 img_width,
struct result_of_calc_num_mprbs_needed *num_mprbs_needed_calc_result)
{
/* all RAM ports in use */
num_mprbs_needed_calc_result->num_lrg_mprbs = 4;
}
void vpu_resource_calc_num_mprbs_for_tn4(const struct vpul_pu *pu,
__u32 img_width,
struct result_of_calc_num_mprbs_needed *num_mprbs_needed_calc_result)
{
/* all RAM ports in use */
num_mprbs_needed_calc_result->num_lrg_mprbs = 8;
}
void vpu_resource_calc_num_mprbs_for_tn5(const struct vpul_pu *pu,
__u32 img_width,
struct result_of_calc_num_mprbs_needed *num_mprbs_needed_calc_result)
{
/* all RAM ports in use */
num_mprbs_needed_calc_result->num_lrg_mprbs = 8;
}
void vpu_resource_calc_num_mprbs_for_cnn(const struct vpul_pu *pu,
__u32 img_width,
struct result_of_calc_num_mprbs_needed *num_mprbs_needed_calc_result)
{
/* all RAM ports in use */
num_mprbs_needed_calc_result->num_lrg_mprbs = 20;
num_mprbs_needed_calc_result->num_sm_mprbs = 21;
}
void vpu_resource_calc_num_mprbs_for_lut(const struct vpul_pu *pu,
__u32 img_width,
struct result_of_calc_num_mprbs_needed *num_mprbs_needed_calc_result)
{
num_mprbs_needed_calc_result->num_sm_mprbs = 1;
}
void vpu_resource_calc_num_mprbs_for_flmorb(const struct vpul_pu *pu,
__u32 img_width,
struct result_of_calc_num_mprbs_needed *num_mprbs_needed_calc_result)
{
num_mprbs_needed_calc_result->num_lrg_mprbs = 3;
}
void vpu_resource_calc_num_mprbs_for_fdepth(const struct vpul_pu *pu,
__u32 img_width,
struct result_of_calc_num_mprbs_needed *num_mprbs_needed_calc_result)
{
get_num_mprbs_acc_to_width_for_non_filters(img_width,
mprb_per_img_width_fdepth, 3,
num_mprbs_needed_calc_result);
}
void vpu_resource_calc_num_mprbs_for_fifo(const struct vpul_pu *pu,
__u32 img_width,
struct result_of_calc_num_mprbs_needed *num_mprbs_needed_calc_result)
{
if (pu->mprb_type == VPUH_MPRB_TYPE_4K)
num_mprbs_needed_calc_result->num_lrg_mprbs = pu->n_mprbs;
else
num_mprbs_needed_calc_result->num_sm_mprbs = pu->n_mprbs;
}
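/**
 * Allocates a PU channel : the requested instance is mapped to its hardware
 * block and channel. For graphs with VPUL_GRAPH_FLAG_FIXED the exact channel
 * must be free; otherwise, if the channel is preempted, the first free
 * channel of the same block is selected and pu->instance is updated
 * accordingly. The chosen channel is marked both preempted and pre-allocated.
 */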
static __s32 __vpu_resource_pu_get(struct vpu_hw_pu *pu_device,
struct vpul_pu *pu, __u32 flags)
{
__s32 ret = 0;
enum VPU_PU_TYPES block_id;
__u32 channel_id;
struct vpu_hw_block *block;
BUG_ON(!pu);
BUG_ON(!pu_device);
if (pu->instance >= VPU_PU_NUMBER) {
ret = -1;
goto p_err;
}
block_id = pu_inst2type[pu->instance];
block = &pu_device->table[block_id];
channel_id = pu->instance - block->start;
if (test_bit(VPUL_GRAPH_FLAG_FIXED,
(const unsigned long *) &flags)) {
if (test_bit(channel_id, block->preempted)) {
ret = -1;
goto p_err;
}
} else if (test_bit(channel_id, block->preempted)) {
channel_id = find_first_zero_bit(block->preempted,
block->total);
if (channel_id >= block->total) {
ret = -1;
goto p_err;
}
pu->instance =
(enum vpul_pu_instance)(block->start + channel_id);
}
set_bit(channel_id, block->preempted);
set_bit(channel_id, block->pre_allocated);
p_err:
return ret;
}
static __s32 __vpu_resource_pu_put(struct vpu_hardware *vpu_hw,
struct vpul_pu *pu)
{
__s32 ret = 0;
enum VPU_PU_TYPES block_id;
__u32 channel_id;
struct vpu_hw_block *block;
struct vpu_hw_pu *pu_device;
BUG_ON(!vpu_hw);
BUG_ON(!pu);
pu_device = &vpu_hw->pu;
if (pu->instance >= VPU_PU_NUMBER) {
ret = -1;
goto p_err;
}
block_id = pu_inst2type[pu->instance];
block = &pu_device->table[block_id];
channel_id = pu->instance - block->start;
if (channel_id >= block->total) {
/*
* vpu_err("channel_id %d is invalid(%d, %d)\n", channel_id,
* pu->instance, block->start);
*/
ret = 0 - channel_id;
goto p_err;
}
pu->instance = (enum vpul_pu_instance)(block->start);
clear_bit(channel_id, block->allocated);
p_err:
return ret;
}
static void __vpu_resource_mprb_put(struct vpu_hardware *vpu_hw,
struct vpul_pu *pu)
{
__u32 i, j;
__u32 num_of_mprbs;
__u32 mprb_num;
struct vpu_hw_block *mprb_block;
struct vpu_hw_mprb *mprb_device;
BUG_ON(!vpu_hw);
BUG_ON(!pu);
mprb_device = &vpu_hw->mprb;
num_of_mprbs = pu->n_mprbs;
if (num_of_mprbs) {
j = 0;
for (i = 0; i < VPU_MAXIMAL_MPRB_CONNECTED; i++) {
mprb_num = pu->mprbs[i];
if (mprb_num != NO_MPRB_CONNECTED) {
if (mprb_num < VPU_HW_NUM_LARGE_MPRBS)
mprb_block =
&mprb_device->table[MPRB_large];
else {
mprb_block =
&mprb_device->table[MPRB_small];
mprb_num -= VPU_HW_NUM_LARGE_MPRBS;
}
clear_bit(mprb_num, mprb_block->allocated);
j++;
if (j == num_of_mprbs)
break;
}
}
}
}
/**
 * This function initializes the "preempted" flags in bitmaps for all PUs
 * and MPRBs
 * it is called for each subchain, before allocation of all PUs and MPRBs
 * needed by the subchain
 * the bits set in the bitmaps indicate resources that are not available for
 * allocation
 * this is done taking into account the options selected by "flags" :
 * - VPUL_GRAPH_FLAG_SHARED_AMONG_TASKS - resources allocated for other tasks
 * will be regarded as "available" (this handling is currently disabled in
 * the code below)
 * - VPUL_GRAPH_FLAG_SHARED_AMONG_SUBCHAINS - resources allocated to other
 * subchains in this task will be regarded as "available"
 */
static void vpu_resource_init_preempted(struct vpu_hw_pu *pu_device,
struct vpu_hw_mprb *mprb_device,
__u32 flags)
{
__u32 i;
for (i = 0; i < pu_device->total; ++i)
bitmap_copy(pu_device->table[i].preempted,
pu_device->table[i].allocated,
pu_device->table[i].total);
for (i = 0; i < mprb_device->total; ++i)
bitmap_copy(mprb_device->table[i].preempted,
mprb_device->table[i].allocated,
mprb_device->table[i].total);
/*
* if (test_bit(VPUL_GRAPH_FLAG_SHARED_AMONG_TASKS,
* (const unsigned long int *)&flags)) {
* for (i = 0; i < pu_device->total; ++i)
* bitmap_zero(pu_device->table[i].preempted,
* pu_device->table[i].total);
* for (i = 0; i < mprb_device->total; ++i)
* bitmap_zero(mprb_device->table[i].preempted,
* mprb_device->table[i].total);
* }
*/
if (!test_bit(VPUL_GRAPH_FLAG_SHARED_AMONG_SUBCHAINS,
(const unsigned long *)&flags)) {
for (i = 0; i < pu_device->total; ++i)
bitmap_or(pu_device->table[i].preempted,
pu_device->table[i].preempted,
pu_device->table[i].pre_allocated,
pu_device->table[i].total);
for (i = 0; i < mprb_device->total; ++i)
bitmap_or(mprb_device->table[i].preempted,
mprb_device->table[i].preempted,
mprb_device->table[i].pre_allocated,
mprb_device->table[i].total);
}
}
static __calc_nbr_of_mprbs_needed calc_nbr_of_mprbs_needed[VPU_PU_TYPES_NUMBER]
= {
#define VPU_PU_TYPE(a, b, c) b,
#include "lib/vpul_pu_types.def"
#undef VPU_PU_TYPE
};
static void __vpu_resource_calc_num_mprbs_needed(
struct vpul_pu *pu,
__u32 *calculated_sizes,
struct result_of_calc_num_mprbs_needed *num_mprbs_needed_calc_result)
{
enum VPU_PU_TYPES pu_type;
BUG_ON((pu->instance) >= (VPU_PU_NUMBER));
pu_type = pu_inst2type[pu->instance];
BUG_ON(pu_type >= VPU_PU_TYPES_NUMBER);
calc_nbr_of_mprbs_needed[pu_type](pu, calculated_sizes[pu->in_size_idx],
num_mprbs_needed_calc_result);
}
static void update_availability_matrix(__u8 *ram_port_row_in_matrix,
__u32 first_ram_port,
__u32 num_large_mprbs_needed,
__u32 num_small_mprbs_needed,
struct vpu_hw_mprb *mprb_device,
const __u32 *ram_ports)
{
	/**
	 * adding rows to availability matrix
	 * each row will reflect a PU RAM port and the MPRBs available for this port
	 * input :
	 * num_large_mprbs_needed : specifies number of rows to add for large MPRBs
	 * (1 for each needed large MPRB)
	 * num_small_mprbs_needed : specifies number of rows to add for small MPRBs
	 * (1 for each needed small MPRB), these rows will follow the rows for large
	 * MPRBs (if both "large" and "small" MPRBs are required)
	 * the rows to be copied are specified by ram_ports
	 * ram_port_row_in_matrix : points to beginning of 1st row to be added
	 * first_ram_port : index in interconnect matrix to 1st RAM port of PU instance
	 * operation :
	 * the function copies num_large_mprbs_needed + num_small_mprbs_needed rows
	 * from the interconnect matrix to the availability matrix
	 * bits for all MPRBs marked as "preempted" will be cleared in the copied rows
	 * bits for MPRBs of the type (1K/4K) other than the needed one will be
	 * cleared as well : when both "large" and "small" MPRBs are required, bits
	 * for "small" MPRBs are cleared in the first num_large_mprbs_needed rows
	 * and bits for "large" MPRBs are cleared in the remaining rows
	 */
__u32 i, j;
__u8 *availability_data;
const __u8 *interconnect_data;
struct vpu_hw_block *mprb_block;
__u32 interconnect_row;
__u32 total_num_mprbs_needed;
total_num_mprbs_needed = num_large_mprbs_needed +
num_small_mprbs_needed;
/* fill rows for ports used for large MPRBs (come first) */
for (i = 0; i < num_large_mprbs_needed; i++) {
interconnect_row = first_ram_port + ram_ports[i];
/* mark all small MPRBs as "unavailable" in these rows */
availability_data = ram_port_row_in_matrix +
OFFSET_TO_SMALL_MEM_BLOCKS;
memset(availability_data, 0, VPU_HW_NUM_SMALL_MPRBS);
availability_data = ram_port_row_in_matrix +
OFFSET_TO_LARGE_MEM_BLOCKS;
interconnect_data =
&interconnect_matrix[interconnect_row]
[OFFSET_TO_LARGE_MEM_BLOCKS];
mprb_block = &mprb_device->table[MPRB_large];
for (j = 0; j < VPU_HW_NUM_LARGE_MPRBS; j++) {
if ((*interconnect_data) &&
(!test_bit(j, mprb_block->preempted)))
*availability_data = 1;
else
*availability_data = 0;
interconnect_data++;
availability_data++;
}
ram_port_row_in_matrix += VPU_HW_TOT_NUM_MPRBS;
}
/* then fill next rows for ports used for small MPRBs */
for (i = num_large_mprbs_needed; i < total_num_mprbs_needed; i++) {
interconnect_row = first_ram_port + ram_ports[i];
/* mark all large MPRBs as "unavailable" in these rows */
availability_data = ram_port_row_in_matrix +
OFFSET_TO_LARGE_MEM_BLOCKS;
memset(availability_data, 0, VPU_HW_NUM_LARGE_MPRBS);
availability_data = ram_port_row_in_matrix +
OFFSET_TO_SMALL_MEM_BLOCKS;
interconnect_data =
&interconnect_matrix[interconnect_row]
[OFFSET_TO_SMALL_MEM_BLOCKS];
mprb_block = &mprb_device->table[MPRB_small];
for (j = 0; j < VPU_HW_NUM_SMALL_MPRBS; j++) {
if ((*interconnect_data) &&
(!test_bit(j, mprb_block->preempted)))
*availability_data = 1;
else
*availability_data = 0;
interconnect_data++;
availability_data++;
}
ram_port_row_in_matrix += VPU_HW_TOT_NUM_MPRBS;
}
}
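/**
 * Prepares MPRB allocation for one PU : verifies that the number of MPRBs
 * needed fits the PU's RAM ports and the global MPRB count, fills this PU's
 * rows of the availability matrix, and records for each row which PU and
 * which of its RAM ports the row stands for (used later when an MPRB is
 * actually allocated to that row).
 */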
static __s32 prepare_mprb_alloc_for_pu(
__u32 cumul_num_mprbs,
struct result_of_calc_num_mprbs_needed *num_mprbs_needed_calc_result,
struct vpul_pu *pu,
__u8 *ram_port_row_in_matrix,
struct vpu_hw_mprb *mprb_device,
struct port_index_2_pu *port_index_to_pu)
{
__u32 first_ram_port;
__u32 num_mprbs_large;
__u32 num_mprbs_small;
__u32 number_of_ram_ports;
__u32 i;
__s32 ret = 0;
__u32 num_mprbs_needed;
const __u32 *mem_ports_array;
mem_ports_array = num_mprbs_needed_calc_result->ram_ports_array;
num_mprbs_large = num_mprbs_needed_calc_result->num_lrg_mprbs;
num_mprbs_small = num_mprbs_needed_calc_result->num_sm_mprbs;
num_mprbs_needed = num_mprbs_large + num_mprbs_small;
/* initialize all MPRB entries to "not in use" */
for (i = 0; i < VPU_MAXIMAL_MPRB_CONNECTED; i++)
pu->mprbs[i] = NO_MPRB_CONNECTED;
number_of_ram_ports =
pu_inst_2_ram_port[pu->instance].number_of_ram_ports;
if (num_mprbs_needed > number_of_ram_ports) {
/* failure */
ret = -1;
goto p_err;
}
if ((cumul_num_mprbs + num_mprbs_needed) > VPU_HW_TOT_NUM_MPRBS) {
/* failure */
ret = -1;
goto p_err;
}
first_ram_port = pu_inst_2_ram_port[pu->instance].first_ram_port;
if (num_mprbs_large)
pu->mprb_type = VPUH_MPRB_TYPE_4K;
else
pu->mprb_type = VPUH_MPRB_TYPE_1K;
update_availability_matrix(ram_port_row_in_matrix,
first_ram_port,
num_mprbs_large, num_mprbs_small,
mprb_device,
mem_ports_array);
for (i = 0; i < num_mprbs_needed; i++) {
port_index_to_pu[cumul_num_mprbs + i].pu = pu;
port_index_to_pu[cumul_num_mprbs + i].port_index_in_pu =
mem_ports_array[i];
}
p_err:
return ret;
}
static void allocate_mprb(__u32 port_num, __u32 mprb_num,
struct vpu_hw_mprb *mprb_device,
struct port_index_2_pu *port_index_to_pu)
{
struct vpu_hw_block *mprb_block;
struct vpul_pu *pu_for_this_port;
__u32 port_num_for_pu;
/* retrieve PU descr for which this MPRB is allocated */
pu_for_this_port = port_index_to_pu[port_num].pu;
port_num_for_pu = port_index_to_pu[port_num].port_index_in_pu;
/* write MPRB num to mprbs array for PU, at index = port index */
pu_for_this_port->mprbs[port_num_for_pu] = mprb_num;
if (mprb_num < VPU_HW_NUM_LARGE_MPRBS)
mprb_block = &mprb_device->table[MPRB_large];
else {
mprb_block = &mprb_device->table[MPRB_small];
mprb_num -= VPU_HW_NUM_LARGE_MPRBS;
}
set_bit(mprb_num, mprb_block->preempted);
set_bit(mprb_num, mprb_block->pre_allocated);
}
/**
 * this function is called after allocating an MPRB, in case of a port with
 * only 1 suitable MPRB
 * its main task is to zero, in the availability matrix, the row and column
 * for the port / MPRB pair which was allocated (specified at input as i and
 * j)
 * in the process, it also decrements "number of suitable MPRBs" for all
 * ports for which the allocated MPRB was previously available; if, in this
 * process, one of the "number of suitable MPRBs" values is decremented to 1,
 * and its port index k in the availability matrix is < current port index
 * (which is i), k will be the value returned by this function; otherwise, it
 * will return the value of i itself.
 */
static __u32 upd_avail_matrix_after_alloc_0(
__u8 availability_matrix[][VPU_HW_TOT_NUM_MPRBS],
__u32 total_num_mprbs_needed,
__u32 i, __u32 j,
__u32 *total_mprbs_per_port,
__u32 *total_ports_per_mprb)
{
__u32 retval;
__u32 k;
retval = i;
total_ports_per_mprb[j] = 0;
for (k = 0; k < total_num_mprbs_needed; k++) {
if (availability_matrix[k][j]) {
availability_matrix[k][j] = 0;
total_mprbs_per_port[k]--;
/* retval shall be modified at most once */
if ((total_mprbs_per_port[k] == 1) && (k < retval))
retval = k;
}
}
return retval;
}
/**
* this function is called after allocating an MPRB, in case of MPRB with only
* 1 port for which this MPRB is suitable
* its main task is to zero in availability matrix the row and column for
* port / MPRB which was allocated (specified at input as i and j)
* in the process, it also decrements "number of ports" for all MPRBs that
* could be used by this port (MPRBs for which there is a 1 in row i); if, in
* this process, one of the "number of ports" values is decremented to 1, and
* MPRB index k in availability matrix is < current MPRB index (which is j),
* the value returned by this function will be k; otherwise, it will return
* value of j itself.
*/
static __u32 upd_avail_matrix_after_alloc_1(
__u8 availability_matrix[][VPU_HW_TOT_NUM_MPRBS],
__u32 i, __u32 j,
__u32 *total_mprbs_per_port,
__u32 *total_ports_per_mprb)
{
__u32 retval;
__u32 k;
retval = j;
total_mprbs_per_port[i] = 0;
for (k = 0; k < VPU_HW_TOT_NUM_MPRBS; k++) {
if (availability_matrix[i][k]) {
availability_matrix[i][k] = 0;
total_ports_per_mprb[k]--;
/* retval shall be modified at most once */
if ((total_ports_per_mprb[k] == 1) && (k < retval))
retval = k;
}
}
return retval;
}
/**
* this function is called after allocating an MPRB, in case of MPRB with
* minimum number of ports (but > 1) for which this MPRB is suitable.
* its main task is to zero in availability matrix the row for the port
* to which this MPRB was allocated (specified at input as i)
* in the process, it also decrements "number of ports" for all MPRBs that
* could be used by this port (MPRBs for which there is a 1 in row i); if, in
* this process, one of the "number of ports" values is decremented to 1, the
* smallest value of index of such an MPRB is returned; otherwise, it returns
* value 0xFF.
*/
static __u32 upd_avail_matrix_after_alloc_2(
__u8 availability_matrix[][VPU_HW_TOT_NUM_MPRBS],
__u32 i, __u32 *total_mprbs_per_port,
__u32 *total_ports_per_mprb)
{
__u32 retval;
__u32 k;
retval = 0xFF;
total_mprbs_per_port[i] = 0;
for (k = 0; k < VPU_HW_TOT_NUM_MPRBS; k++) {
if (availability_matrix[i][k]) {
availability_matrix[i][k] = 0;
total_ports_per_mprb[k]--;
if ((total_ports_per_mprb[k] == 1) && (retval == 0xFF))
retval = k;
}
}
return retval;
}
/**
* this function is called after allocating an MPRB, in case of MPRB with
* minimum number of ports (but > 1) for which this MPRB is suitable.
* its main task is to zero in availability matrix the column of the MPRB that
* was allocated (specified at input as j)
* in the process, it also decrements "number of suitable MPRBs" for all ports
* that could have used this MPRB (ports for which there is a 1 in column j);
* if, in this process, one of the "number of suitable MPRBs" values is
* decremented to 1, the smallest value of index of such a port is returned;
* otherwise, it returns value 0xFF.
*/
static __u32 upd_avail_matrix_after_alloc_3(
__u8 availability_matrix[][VPU_HW_TOT_NUM_MPRBS],
__u32 j, __u32 total_num_mprbs_needed,
__u32 *total_mprbs_per_port,
__u32 *total_ports_per_mprb)
{
__u32 retval;
__u32 k;
retval = 0xFF;
total_ports_per_mprb[j] = 0;
for (k = 0; k < total_num_mprbs_needed; k++) {
if (availability_matrix[k][j]) {
availability_matrix[k][j] = 0;
total_mprbs_per_port[k]--;
if ((total_mprbs_per_port[k] == 1) && (retval == 0xFF))
retval = k;
}
}
return retval;
}
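/**
 * Allocates MPRBs for all rows of the availability matrix (one row per
 * needed MPRB, one column per physical MPRB) by constraint propagation :
 * 1. allocate every port that has exactly one suitable MPRB left;
 * 2. allocate every MPRB that is suitable for exactly one port;
 * 3. otherwise pick the MPRB suitable for the fewest ports and assign it to
 * the first port that can use it.
 * Each allocation removes the allocated row and/or column from the matrix,
 * possibly creating new forced assignments for steps 1 and 2; the function
 * fails when MPRBs are still needed but none remain available.
 */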
static __s32 __vpu_resource_mprbs_get(
__u8 availability_matrix[][VPU_HW_TOT_NUM_MPRBS],
__u32 total_num_mprbs_needed,
struct vpu_hw_mprb *mprb_device,
struct port_index_2_pu *port_index_to_pu)
{
/**
* used for comparing to total_num_mprbs_needed (successful
 * completion)
*/
__u32 num_mprbs_allocated;
__u32 total_mprbs_per_port[VPU_HW_TOT_NUM_MPRBS];
__u32 total_ports_per_mprb[VPU_HW_TOT_NUM_MPRBS];
__u32 i, j, k;
__s32 ret = 0;
__u32 min_num_ports_per_mprb;
__u32 mprb_with_min_num_ports;
__u32 index_of_port_with_1_mprb;
__u32 index_of_mprb_with_1_port;
num_mprbs_allocated = 0;
BUG_ON(total_num_mprbs_needed > VPU_HW_TOT_NUM_MPRBS);
/* calculate and store totals per row (for all ports) */
for (i = 0; i < total_num_mprbs_needed; i++) {
total_mprbs_per_port[i] = 0;
for (j = 0; j < VPU_HW_TOT_NUM_MPRBS; j++)
total_mprbs_per_port[i] +=
availability_matrix[i][j];
}
/* calculate and store totals per column (for all MPRBs) */
for (j = 0; j < VPU_HW_TOT_NUM_MPRBS; j++) {
total_ports_per_mprb[j] = 0;
for (i = 0; i < total_num_mprbs_needed; i++)
total_ports_per_mprb[j] +=
availability_matrix[i][j];
}
index_of_port_with_1_mprb = 0;
/**
* the following loop is executed as long as there are ports for which
* there is only 1 MPRB available (total_mprbs_per_port = 1)
*/
find_port_with_1_mprb:
BUG_ON(total_num_mprbs_needed > VPU_HW_TOT_NUM_MPRBS);
for (i = index_of_port_with_1_mprb; i < total_num_mprbs_needed;) {
k = i;
if (total_mprbs_per_port[i] == 1) {
for (j = 0; j < VPU_HW_TOT_NUM_MPRBS; j++) {
/* find THE mprb available for this port */
if (availability_matrix[i][j]) {
allocate_mprb(i, j, mprb_device,
port_index_to_pu);
num_mprbs_allocated++;
if (num_mprbs_allocated ==
total_num_mprbs_needed)
/* successfully completed */
goto end_vpu_rsrc_mprbs_get;
/* remove MPRB from
* availability matrix
*/
k = upd_avail_matrix_after_alloc_0(
availability_matrix,
total_num_mprbs_needed, i, j,
total_mprbs_per_port,
total_ports_per_mprb);
break;
}
}
}
/**
* if found port #k for which number of suitable MPRBs was
* decremented to 1 : proceed from port #k
*/
if (k == i)
i++;
else
i = k;
}
index_of_mprb_with_1_port = 0;
/**
* the following loop is executed as long as there are mprbs available
* to only 1 port (total_ports_per_mprb == 1)
*/
find_mprb_with_1_port:
BUG_ON(total_num_mprbs_needed > VPU_HW_TOT_NUM_MPRBS);
for (j = index_of_mprb_with_1_port; j < VPU_HW_TOT_NUM_MPRBS;) {
k = j;
if (total_ports_per_mprb[j] == 1) {
for (i = 0; i < total_num_mprbs_needed; i++) {
/* find THE port which can use this MPRB */
if (availability_matrix[i][j]) {
allocate_mprb(i, j, mprb_device,
port_index_to_pu);
num_mprbs_allocated++;
if (num_mprbs_allocated ==
total_num_mprbs_needed)
/* successfully completed */
goto end_vpu_rsrc_mprbs_get;
k = upd_avail_matrix_after_alloc_1(
availability_matrix, i, j,
total_mprbs_per_port,
total_ports_per_mprb);
break;
}
}
}
/**
* if found MPRB #k with number of ports for which this MPRB is
* suitable was decremented to 1 : proceed from MPRB #k
*/
if (k == j)
j++;
else
j = k;
}
/* find mprb with minimum number of ports */
find_mprb_with_min_num_ports:
min_num_ports_per_mprb = 0xFF;
mprb_with_min_num_ports = 0xFF;
for (j = 0; j < VPU_HW_TOT_NUM_MPRBS; j++) {
if ((total_ports_per_mprb[j] < min_num_ports_per_mprb) &&
(total_ports_per_mprb[j] != 0)) {
min_num_ports_per_mprb = total_ports_per_mprb[j];
mprb_with_min_num_ports = j;
}
}
if (mprb_with_min_num_ports == 0xFF) {
/* no more MPRBs available : alloc failed */
ret = -1;
goto end_vpu_rsrc_mprbs_get;
}
j = mprb_with_min_num_ports;
/**
* found mprb with minimum number of ports (but non-zero) : allocate it
* to first suitable port
*/
for (i = 0; i < total_num_mprbs_needed; i++) {
if (availability_matrix[i][j])
break;
}
allocate_mprb(i, j, mprb_device, port_index_to_pu);
num_mprbs_allocated++;
if (num_mprbs_allocated == total_num_mprbs_needed)
/* successfully completed */
goto end_vpu_rsrc_mprbs_get;
/* remove MPRB from availability matrix */
availability_matrix[i][j] = 0;
/* remove this row from availability matrix */
index_of_mprb_with_1_port = upd_avail_matrix_after_alloc_2(
availability_matrix, i,
total_mprbs_per_port, total_ports_per_mprb);
/* remove this column from availability matrix */
index_of_port_with_1_mprb = upd_avail_matrix_after_alloc_3(
availability_matrix, j, total_num_mprbs_needed,
total_mprbs_per_port, total_ports_per_mprb);
/* found a port for which only 1 MPRB is available ? */
if (index_of_port_with_1_mprb != 0xFF)
goto find_port_with_1_mprb;
/* found an MPRB available to only 1 port ? */
if (index_of_mprb_with_1_port != 0xFF)
goto find_mprb_with_1_port;
goto find_mprb_with_min_num_ports;
end_vpu_rsrc_mprbs_get:
return ret;
}
static __u8 availability_matrix[VPU_HW_TOT_NUM_MPRBS][VPU_HW_TOT_NUM_MPRBS];
static __u32 actual_sizes[VPUL_MAX_SIZES_OP];
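/**
 * Acquires resources for one vertex : the actual widths are calculated
 * first; then, for each HW subchain (CPU-op subchains are skipped), the
 * preempted bitmaps are initialized, latency balancing may add delay PUs,
 * every PU instance is allocated and its MPRB requirements are written into
 * the availability matrix, and finally all MPRBs of the subchain are
 * allocated in one pass.
 */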
static __s32 vpu_resource_get_for_vertex(struct vpul_task *task,
__u32 flags,
struct vpul_vertex *vertex,
const struct vpul_3dnn_process_base *proc_3dnn_base,
struct vpu_hw_pu *pu_device,
struct vpu_hw_mprb *mprb_device)
{
struct result_of_calc_num_mprbs_needed mprbs_needed;
__u32 j, k, subchain_cnt, pu_cnt, orig_pu_count;
__u32 num_mprbs_needed;
__u32 cumul_n_mprbs_needed;
struct vpul_subchain *subchain;
struct vpul_pu *pu;
__u32 flags_copy;
/* used for accessing vpul_pu and updating its contents when allocating MPRB */
struct port_index_2_pu port_index_to_pu[VPU_HW_TOT_NUM_MPRBS];
__s32 ret = 0;
subchain_cnt = vertex->num_of_subchains;
subchain = fst_vtx_sc_ptr(task, vertex);
if (vertex->vtype == VPUL_VERTEXT_3DNN_PROC)
ret = set_actual_sizes_3dnn(task, proc_3dnn_base, actual_sizes);
else
ret = set_actual_sizes(task, vertex, actual_sizes);
if (ret)
goto p_err;
for (j = 0; j < subchain_cnt; j++, subchain++) {
if (subchain->stype != VPUL_SUB_CH_CPU_OP) {
/* mark "preempted" (= non-allocatable) resources */
vpu_resource_init_preempted(pu_device, mprb_device, flags);
orig_pu_count = subchain->num_of_pus;
if (!test_bit(VPUL_GRAPH_FLAG_DSBL_LATENCY_BALANCING,
(const unsigned long *) &flags)) {
ret = latency_balancing(task, vertex, subchain, actual_sizes);
if (ret)
goto p_err;
}
/* new count includes PUs added for delay balancing */
pu_cnt = subchain->num_of_pus;
pu = fst_sc_pu_ptr(task, subchain);
cumul_n_mprbs_needed = 0;
for (k = 0; k < pu_cnt; k++, pu++) {
flags_copy = flags;
/* for PUs added for delay balancing :
* ignore VPUL_GRAPH_FLAG_FIXED
*/
if (k >= orig_pu_count)
clear_bit(VPUL_GRAPH_FLAG_FIXED,
(volatile unsigned long *)&flags_copy);
ret = __vpu_resource_pu_get(pu_device, pu, flags_copy);
if (ret)
goto p_err;
/**
* initializing result structure to default values,
* __vpu_resource_calc_num_mprbs_needed will update
* only members of this structure whose values are
* different from default values
*/
mprbs_needed.num_lrg_mprbs = 0;
mprbs_needed.num_sm_mprbs = 0;
mprbs_needed.ram_ports_array = consecutive_mem_ports;
/* for PUs added for delay balancing : the number of MPRBs needed
 * has already been calculated by the delay balancing function
 */
if (k < orig_pu_count)
__vpu_resource_calc_num_mprbs_needed(pu, actual_sizes, &mprbs_needed);
else if (pu->mprb_type == VPUH_MPRB_TYPE_4K)
mprbs_needed.num_lrg_mprbs = pu->n_mprbs;
else
mprbs_needed.num_sm_mprbs = pu->n_mprbs;
if (pu->instance ==
((flags >> VPUL_STATIC_ALLOC_PU_INSTANCE_LSB) &
VPUL_STATIC_ALLOC_PU_INSTANCE_MASK)) {
if (flags & VPUL_STATIC_ALLOC_LARGE_INSTEAD_SMALL_MPRB_MASK) {
if (mprbs_needed.num_lrg_mprbs == 0) {
mprbs_needed.num_lrg_mprbs = mprbs_needed.num_sm_mprbs;
mprbs_needed.num_sm_mprbs = 0;
}
}
}
num_mprbs_needed = mprbs_needed.num_lrg_mprbs +
mprbs_needed.num_sm_mprbs;
pu->n_mprbs = num_mprbs_needed;
if (num_mprbs_needed) {
ret = prepare_mprb_alloc_for_pu(
cumul_n_mprbs_needed,
&mprbs_needed, pu,
&availability_matrix[cumul_n_mprbs_needed][0],
mprb_device,
port_index_to_pu);
if (ret)
goto p_err;
cumul_n_mprbs_needed += num_mprbs_needed;
}
}
/**
 * MPRB allocation is performed after allocating all PU instances for the
 * subchain and calculating the number of MPRBs needed for each of them
*/
if (cumul_n_mprbs_needed) {
ret = __vpu_resource_mprbs_get(availability_matrix,
cumul_n_mprbs_needed,
mprb_device,
port_index_to_pu);
if (ret)
goto p_err;
}
}
}
p_err:
if (ret == 0)
ret = VPU_STATUS_SUCCESS;
else if (ret != VPU_STATUS_BAD_PARAMS)
ret = VPU_STATUS_FAILURE;
return ret;
}
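/**
 * Resource acquisition is done in two phases : resources are first marked
 * in the pre_allocated bitmaps while all vertices (regular processes first,
 * then 3DNN processes) are estimated, and only if every vertex succeeds are
 * the pre_allocated bitmaps merged into the allocated bitmaps.
 */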
__s32 __vpu_resource_get(struct vpu_hardware *vpu_hw, struct vpul_task *task, __u32 flags)
{
__u32 i;
struct vpu_hw_pu *pu_device;
struct vpu_hw_mprb *mprb_device;
struct vpul_vertex *vertex;
const struct vpul_3dnn_process_base *proc_3dnn_base;
__s32 ret = VPU_STATUS_SUCCESS;
if ((vpu_hw) && (task)) {
pu_device = &vpu_hw->pu;
mprb_device = &vpu_hw->mprb;
vertex = fst_vtx_ptr(task);
proc_3dnn_base = fst_3dnn_process_base_ptr(task);
/* prepare */
for (i = 0; i < pu_device->total; ++i)
bitmap_zero(pu_device->table[i].pre_allocated,
pu_device->table[i].total);
for (i = 0; i < mprb_device->total; ++i)
bitmap_zero(mprb_device->table[i].pre_allocated,
mprb_device->table[i].total);
/* estimation */
for (i = 0; i < task->t_num_of_vertices; i++, vertex++) {
/* skip vertex if not of type "process"
 * (3DNN processes are handled separately)
*/
if (vertex->vtype == VPUL_VERTEXT_PROC) {
ret = vpu_resource_get_for_vertex(task,
flags,
vertex,
NULL,
pu_device,
mprb_device);
if (ret != VPU_STATUS_SUCCESS)
break;
}
}
if (ret == VPU_STATUS_SUCCESS) {
for (i = 0; i < task->t_num_of_3dnn_process_bases; i++, proc_3dnn_base++) {
vertex = vertex_referencing_this_3dnn_proc_base(task, i);
if (vertex) {
ret = vpu_resource_get_for_vertex(task,
flags,
vertex,
proc_3dnn_base,
pu_device,
mprb_device);
if (ret != VPU_STATUS_SUCCESS)
break;
}
}
if (ret == VPU_STATUS_SUCCESS) {
/* acquire */
for (i = 0; i < pu_device->total; ++i)
bitmap_or(pu_device->table[i].allocated,
pu_device->table[i].allocated,
pu_device->table[i].pre_allocated,
pu_device->table[i].total);
for (i = 0; i < mprb_device->total; ++i)
bitmap_or(mprb_device->table[i].allocated,
mprb_device->table[i].allocated,
mprb_device->table[i].pre_allocated,
mprb_device->table[i].total);
}
}
} else
ret = VPU_STATUS_BAD_PARAMS;
return ret;
}
__s32 __vpu_resource_put(struct vpu_hardware *vpu_hw,
struct vpul_task *task)
{
__u32 i, j, subchain_cnt, pu_cnt;
struct vpul_subchain *subchain;
struct vpul_pu *pu;
__s32 ret = 0;
if ((!vpu_hw) || (!task)) {
ret = VPU_STATUS_BAD_PARAMS;
goto p_err;
}
subchain_cnt = task->t_num_of_subchains;
subchain = fst_sc_ptr(task);
for (i = 0; i < subchain_cnt; i++, subchain++) {
pu_cnt = subchain->num_of_pus;
if (subchain->stype != VPUL_SUB_CH_CPU_OP) {
pu = fst_sc_pu_ptr(task, subchain);
for (j = 0; j < pu_cnt; j++, pu++) {
ret = __vpu_resource_pu_put(vpu_hw, pu);
if (ret)
goto p_err;
__vpu_resource_mprb_put(vpu_hw, pu);
}
}
}
p_err:
if (ret == 0)
ret = VPU_STATUS_SUCCESS;
else if (ret != VPU_STATUS_BAD_PARAMS)
ret = VPU_STATUS_FAILURE;
return ret;
}