 // Test code (Scala): performs a single prediction against a TensorFlow SavedModel.
 // Stated target load from the author: 100 QPS per Docker container.

 // Session configuration: one "CPU" device entry per available processor,
 // 8 inter-op and 8 intra-op threads, and a 3000 ms per-operation timeout.
 val config = ConfigProto.newBuilder
   .putDeviceCount("CPU", Runtime.getRuntime.availableProcessors)
   .setInterOpParallelismThreads(8)
   .setIntraOpParallelismThreads(8)
   .setOperationTimeoutInMs(3000)
   .build

 // Run options passed to the loader: 5000 ms timeout.
 val options = RunOptions.newBuilder
   .setTimeoutInMs(5000)
   .build

 // Load the SavedModel exported under the "serve" tag from `path`.
 // NOTE(review): the bundle/session is never closed in this snippet; presumably it is
 // meant to be loaded once and reused across requests -- confirm, and close it on shutdown.
 val modelBundle = SavedModelBundle
   .loader(s"$path")
   .withTags("serve")
   .withConfigProto(config.toByteArray)
   .withRunOptions(options.toByteArray)
   .load

 val kernel = modelBundle.session

 // Input features keyed by the tensor name they are fed to.
 val data: Map[String, Seq[Float]] = Map(
   "tensor1" -> Seq(0.1f, 0.122f)
   // …… further `tensorName -> values` entries were elided in the original snippet
 )

 val runner = kernel.runner()

 // Every input tensor created below is tracked here so it can be closed afterwards.
 val inputTensorList: util.ArrayList[Tensor[java.lang.Float]] =
   new util.ArrayList[Tensor[java.lang.Float]]()

 // One score per fetched output: the first element of the first row of each output tensor.
 // All tensors are released in `finally` blocks so that a failure in feed/fetch/run/copyTo
 // (e.g. a timeout) does not leak them; the original only closed them on the success path.
 val scores: Array[Float] =
   try {
     data.foreach { case (tensorName, featureId) =>
       val dataInput: FloatBuffer = FloatBuffer.allocate(featureId.size)
       featureId.foreach(featureValue => dataInput.put(featureValue))
       // Upcast to Buffer before flip(): the original does this, presumably to avoid the
       // JDK 9+ covariant `FloatBuffer.flip()` signature when running on Java 8.
       dataInput.asInstanceOf[Buffer].flip()

       // Each feature vector is fed as a single row: shape [1, featureId.size].
       val tensorShape: Array[Long] = Array(1L, featureId.size.toLong)
       val tensor = Tensor.create(tensorShape, dataInput)
       // Register for cleanup before feeding, so the tensor is closed even if feed() throws.
       inputTensorList.add(tensor)
       runner.feed(tensorName, tensor)
     }

     // Fetch outputs 0 and 1 of the "StatefulPartitionedCall" operation.
     for (i <- 0 until 2) {
       runner.fetch("StatefulPartitionedCall", i)
     }

     val output = runner.run().asScala
     try {
       output.map { ten =>
         // Copies the tensor into a 2-D array sized from its first two dimensions,
         // so each fetched output is expected to be rank 2.
         val tensorData: Array[Array[Float]] =
           ten.copyTo(Array.ofDim[Float](ten.shape()(0).toInt, ten.shape()(1).toInt))
         tensorData(0).head
       }.toArray
     } finally {
       output.foreach(_.close())
     }
   } finally {
     inputTensorList.asScala.foreach(_.close())
   }