I'm using the front TrueDepth camera together with Vision to recognize points in the image and run some measurements. I understand that Vision coordinates are normalized, so I convert the normalized Vision points to CGPoints matching the view, then match those against the depthData in dataOutputSynchronizer to get a z value. Then, using the camera intrinsics, I'm trying to get the distance between two points in 3D space.

I am successfully finding the points and (I believe) converting them to screen points. My thinking here is that these CGPoints would be no different than if I had tapped them on the screen.

My issue is that even though the converted CGPoints stay roughly the same (my hand does move a little during testing, but it mostly stays in a plane parallel to the camera), and I try to look up the depth the same way for both, the depths can be wildly different -- especially for point 2. Depth point 2 seems closer to the truth for the distance measurement (my hand is about a foot from the camera), but it varies a lot and is still not accurate.

Here is a console print of the relevant data:

there are 2 points found
recognized points
[(499.08930909633636, 634.0807711283367), (543.7462849617004, 1061.8824380238852)]
DEPTH POINT 1 =  3.6312041
DEPTH POINT 2 =  0.2998223

there are 2 points found
recognized points
[(498.33644700050354, 681.3769372304281), (602.3667773008347, 1130.4955183664956)]
DEPTH POINT 1 =  3.6276162
DEPTH POINT 2 =  0.560331

Here is some of the relevant code.

dataOutputSynchronizer

func dataOutputSynchronizer(_ synchronizer: AVCaptureDataOutputSynchronizer,
                                didOutput synchronizedDataCollection: AVCaptureSynchronizedDataCollection) {
        
        var handPoints: [CGPoint] = []
        
        // Read all outputs
        guard renderingEnabled,
            let syncedDepthData: AVCaptureSynchronizedDepthData =
            synchronizedDataCollection.synchronizedData(for: depthDataOutput) as? AVCaptureSynchronizedDepthData,
            let syncedVideoData: AVCaptureSynchronizedSampleBufferData =
            synchronizedDataCollection.synchronizedData(for: videoDataOutput) as? AVCaptureSynchronizedSampleBufferData else {
                // only work on synced pairs
                return
        }
        
        if syncedDepthData.depthDataWasDropped || syncedVideoData.sampleBufferWasDropped {
            return
        }
        
        let depthPixelBuffer = syncedDepthData.depthData.depthDataMap
        guard let videoPixelBuffer = CMSampleBufferGetImageBuffer(syncedVideoData.sampleBuffer) else {
            return
        }
        
        // Get the cameraIntrinsics
        guard let  cameraIntrinsics = syncedDepthData.depthData.cameraCalibrationData?.intrinsicMatrix else {
            return
        }
        
        let image = CIImage(cvPixelBuffer: videoPixelBuffer)
        
        let handler = VNImageRequestHandler(
           cmSampleBuffer: syncedVideoData.sampleBuffer,
           orientation: .up,
           options: [:]
         )
        
         do {
           try handler.perform([handPoseRequest])
           guard
             let results = handPoseRequest.results?.prefix(2),
             !results.isEmpty
           else {
             return
           }

            var recognizedPoints: [VNRecognizedPoint] = []

             try results.forEach { observation in
               let fingers = try observation.recognizedPoints(.all)

               if let middleTipPoint = fingers[.middleDIP] {
                 recognizedPoints.append(middleTipPoint)
               }

               if let wristPoint = fingers[.wrist] {
                 recognizedPoints.append(wristPoint)
               }
             }

             // Store the Points in handPoints if they are confident points
             handPoints = recognizedPoints.filter {
               $0.confidence > 0.90
             }
             .map {
               // Adjust the Y
               CGPoint(x: $0.location.x, y: 1 - $0.location.y)
             }
             
             // Process the Points Found
             DispatchQueue.main.sync {
              self.processPoints(handPoints,depthPixelBuffer,videoPixelBuffer,cameraIntrinsics)
             }
         } catch {
             // Be more graceful here 
         }
    }

Process Points

func processPoints(_ handPoints: [CGPoint],_ depthPixelBuffer: CVImageBuffer,_ videoPixelBuffer: CVImageBuffer,_ cameraIntrinsics: simd_float3x3) {

        // This converts the normalized point to screen points
        // cameraView.previewLayer is a AVCaptureVideoPreviewLayer inside a UIView
        let convertedPoints = handPoints.map {
            cameraView.previewLayer.layerPointConverted(fromCaptureDevicePoint: $0)
        }
       
        // We need 2 hand points to get the distance 
        if handPoints.count == 2 {
            print("there are 2 points found");
            print("recognized points")
            print(convertedPoints)
            
            let handVisionPoint1 = convertedPoints[0]
        
            let handVisionPoint2 = convertedPoints[1]
            
            let scaleFactor = CGFloat(CVPixelBufferGetWidth(depthPixelBuffer)) / CGFloat(CVPixelBufferGetWidth(videoPixelBuffer))
            
            CVPixelBufferLockBaseAddress(depthPixelBuffer, .readOnly)
            let floatBuffer = unsafeBitCast(CVPixelBufferGetBaseAddress(depthPixelBuffer), to: UnsafeMutablePointer<Float32>.self)
            
            let width = CVPixelBufferGetWidth(depthPixelBuffer)
            let height = CVPixelBufferGetHeight(depthPixelBuffer)
            
            let handVisionPixelX = Int((handVisionPoint1.x * scaleFactor).rounded())
            let handVisionPixelY = Int((handVisionPoint1.y * scaleFactor).rounded())
            
            let handVisionPixe2X = Int((handVisionPoint2.x * scaleFactor).rounded())
            let handVisionPixe2Y = Int((handVisionPoint2.y * scaleFactor).rounded())
            
            
            let rowDataPoint1 = CVPixelBufferGetBaseAddress(depthPixelBuffer)! + handVisionPixelY * CVPixelBufferGetBytesPerRow(depthPixelBuffer)
            let handVisionPoint1Depth = rowDataPoint1.assumingMemoryBound(to: Float32.self)[handVisionPixelX]
            
            print("DEPTH POINT 1 = ", handVisionPoint1Depth)
            
            let rowDataPoint2 = CVPixelBufferGetBaseAddress(depthPixelBuffer)! + handVisionPixe2Y * CVPixelBufferGetBytesPerRow(depthPixelBuffer)
            let handVisionPoint2Depth = rowDataPoint2.assumingMemoryBound(to: Float32.self)[handVisionPixe2X]
            
            print("DEPTH POINT 2 = ", handVisionPoint2Depth)
            CVPixelBufferUnlockBaseAddress(depthPixelBuffer, .readOnly)
        }
}

In my head, I now think my logic for finding the correct pixel in the depth map is wrong. If that's not it, then I wonder whether the data streams are out of sync. Honestly, I'm just a bit lost at this point. Thanks for any help!
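For reference, one way to sanity-check whether the two streams really are out of sync (not something in the code above) would be to compare the timestamps of the paired outputs inside dataOutputSynchronizer. A minimal sketch, using the syncedDepthData and syncedVideoData from the guard above:

// Compare presentation timestamps of the synchronized depth and video samples.
// A large or growing delta would point to a synchronization problem.
let depthTime = syncedDepthData.timestamp
let videoTime = CMSampleBufferGetPresentationTimeStamp(syncedVideoData.sampleBuffer)
let deltaMs = CMTimeGetSeconds(CMTimeSubtract(videoTime, depthTime)) * 1000.0
print("depth/video timestamp delta: \(deltaMs) ms")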

Accepted Answer

The answer ended up being fairly simple. Credit to Reality-Dev's post on the Apple Developer forums (and his Body Tracking git repo) for pointing me in the right direction.

In this line I was converting the normalized Vision points to screen points:

let convertedPoints = handPoints.map {
    cameraView.previewLayer.layerPointConverted(fromCaptureDevicePoint: $0)
}

That was the main problem. I needed the normalized points. A very rookie mistake. This code gives an accurate depth-map reading at the Vision points:

if handPoints.count == 2 {

    // handPoints are still in Vision's normalized coordinate space
    // (already flipped to a top-left origin in dataOutputSynchronizer).
    let handVisionPoint1 = handPoints[0]
    let handVisionPoint2 = handPoints[1]

    // The indexing below assumes a 32-bit float depth map.
    guard CVPixelBufferGetPixelFormatType(depthPixelBuffer) == kCVPixelFormatType_DepthFloat32 else { return }

    let width = CVPixelBufferGetWidth(depthPixelBuffer)
    let height = CVPixelBufferGetHeight(depthPixelBuffer)

    // Scale the normalized points straight to pixel coordinates in the depth map.
    let colPosition1 = Int(handVisionPoint1.x * CGFloat(width))
    let rowPosition1 = Int(handVisionPoint1.y * CGFloat(height))

    let colPosition2 = Int(handVisionPoint2.x * CGFloat(width))
    let rowPosition2 = Int(handVisionPoint2.y * CGFloat(height))

    CVPixelBufferLockBaseAddress(depthPixelBuffer, .readOnly)
    defer { CVPixelBufferUnlockBaseAddress(depthPixelBuffer, .readOnly) }

    if let baseAddress = CVPixelBufferGetBaseAddress(depthPixelBuffer) {

        // Index into the Float32 buffer: row * width + column.
        // (Assumes no row padding; otherwise use CVPixelBufferGetBytesPerRow.)
        let index1 = colPosition1 + (rowPosition1 * width)
        let index2 = colPosition2 + (rowPosition2 * width)

        let offset1 = index1 * MemoryLayout<Float>.stride
        let offset2 = index2 * MemoryLayout<Float>.stride

        let distanceValue1 = baseAddress.load(fromByteOffset: offset1, as: Float.self)
        let distanceValue2 = baseAddress.load(fromByteOffset: offset2, as: Float.self)

        print("DEPTH POINT 1 = ", distanceValue1)
        print("DEPTH POINT 2 = ", distanceValue2)
    }
}
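With correct depth values at the two Vision points, the last step from the question -- the 3D distance via the camera intrinsics -- can be sketched as below. This is a minimal sketch, not from the original post: cameraSpacePoint is a hypothetical helper, and it assumes the depth values are in meters (DepthFloat32) and that the intrinsic matrix, which Apple reports relative to intrinsicMatrixReferenceDimensions (usually the full video resolution), is scaled down to the depth-map size via the scale parameter.

import simd

// Hypothetical helper (not in the original post): back-project a depth-map
// pixel into camera space with the pinhole model. `scale` converts the
// intrinsics from intrinsicMatrixReferenceDimensions to depth-map pixels,
// e.g. Float(width) / Float(calibrationData.intrinsicMatrixReferenceDimensions.width).
func cameraSpacePoint(col: Int, row: Int, depth: Float,
                      intrinsics: simd_float3x3, scale: Float) -> simd_float3 {
    // simd matrices are column-major: fx = [0][0], fy = [1][1], cx = [2][0], cy = [2][1]
    let fx = intrinsics[0][0] * scale
    let fy = intrinsics[1][1] * scale
    let cx = intrinsics[2][0] * scale
    let cy = intrinsics[2][1] * scale
    let x = (Float(col) - cx) * depth / fx
    let y = (Float(row) - cy) * depth / fy
    return simd_float3(x, y, depth)
}

// Usage with the values computed above (scale is an assumption as described):
// let p1 = cameraSpacePoint(col: colPosition1, row: rowPosition1, depth: distanceValue1,
//                           intrinsics: cameraIntrinsics, scale: scale)
// let p2 = cameraSpacePoint(col: colPosition2, row: rowPosition2, depth: distanceValue2,
//                           intrinsics: cameraIntrinsics, scale: scale)
// let distanceInMeters = simd_distance(p1, p2)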
