天天看點

python手勢視訊識别标記環境

環境

python3.7

  • pytorch1.1.0
  • torchvision 0.3.0
  • cuda 9.0以上

    ##項目架構

  • Audio-and-video-demo
    • bgm (背景語音播報檔案)
    • images
      • ffmpeg_img
      • rec_image
    • model (自訓練模型儲存)
    • video (輸入輸出視訊檔案)
    • bgm.py
    • combination.py
    • ffempeg-img-recognition.py
    • gesture-recognition.py
    • main.py
    • putlabel.py

子產品

ffempeg-img-recognition.py

    将手勢視訊按幀分解為圖檔并儲存

def _save_frames(frames, quality=None):
    """Save every decoded frame as ``images/ffmpeg_img/<frame.index>.jpg``.

    Args:
        frames: Iterable of decoded PyAV video frames.
        quality: Optional JPEG quality passed to the image ``save`` call.
            When ``None`` the keyword is omitted entirely.
    """
    for frame in frames:
        savepath = 'images/ffmpeg_img/' + '%d.jpg' % frame.index
        image = frame.to_image()
        if quality is None:
            image.save(savepath)
        else:
            image.save(savepath, quality=quality)


def ffmpeg_img_extract(videopath):
    """Decode *videopath* with non-key frames skipped and save the frames.

    Frames are written as JPEG (quality 80) into ``images/ffmpeg_img/``.

    Args:
        videopath: Path of the video file to open with PyAV.
    """
    # The context manager closes the container even if decoding fails.
    with av.open(videopath) as container:
        stream = container.streams.video[0]
        # Ask the decoder to skip everything that is not a key frame.
        stream.codec_context.skip_frame = 'NONKEY'
        _save_frames(container.decode(stream), quality=80)


def img_to_video(videopath):
    """Decode every frame of the first video stream and save it as JPEG.

    Despite its name, this function converts a video into per-frame images
    stored in ``images/ffmpeg_img/``.

    Args:
        videopath: Path of the video file to open with PyAV.
    """
    with av.open(videopath) as container:
        _save_frames(container.decode(video=0))

           

gesture-recognition.py

    利用訓練好的模型對手勢圖像進行識别,并用bgm_label清單記錄每一幀的标簽。這裡使用的是googlenet預訓練模型對我們的資料集進行訓練,采用學習率降低法多次疊代訓練,得到的模型對手勢圖像識别正确率在95%以上。

def gesture_recognition(filepath):
    """Classify every extracted frame and return one gesture label per frame.

    Frames are read as ``<filepath>0.jpg`` .. ``<filepath><count-1>.jpg``,
    where ``count`` is the number of entries in the directory. The top-1
    label of each frame is drawn onto the frame via ``putText`` and also
    collected in the returned list.

    Args:
        filepath: Directory prefix of the frames. It is concatenated directly
            with the file name, so it must end with a path separator.

    Returns:
        A list with the predicted label of each frame, in frame order.
        Empty when the directory contains no entries.
    """
    frame_count = len(os.listdir(filepath))
    if frame_count == 0:
        # Nothing to classify, so do not touch the label file or the model.
        return []

    device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")

    # The transform pipeline, label list and model do not depend on the
    # frame, so they are built once instead of once per frame.
    # NOTE(review): RandomRotation makes inference non-deterministic; it is
    # kept because the accuracy of the shipped model was presumably measured
    # with it -- confirm before removing.
    preprocess = transforms.Compose([
        transforms.Resize(256),
        transforms.RandomRotation(20),
        transforms.ToTensor(),
        transforms.Normalize(mean=[0.485, 0.456, 0.406],
                             std=[0.229, 0.224, 0.225]),
    ])

    with open('images/gesture_24.txt', 'r', encoding='gbk') as clf:
        labels = [line.strip() for line in clf]

    # SECURITY: torch.load unpickles the file; only load trusted models.
    # map_location and .to(device) keep the model on the same device as the
    # input tensor, including CPU-only machines.
    model = torch.load('model/googlenet_model.pkl', map_location=device)
    model.to(device)
    model.eval()

    bgm_label = []
    for i in range(frame_count):
        filename = filepath + str(i) + '.jpg'
        with Image.open(filename) as input_image:
            input_tensor = preprocess(input_image)
        # Add the batch dimension expected by the model.
        image_tensor = input_tensor.unsqueeze(0).to(device)

        start = time()
        with torch.no_grad():
            output = model(image_tensor)
        prob = F.softmax(output[0], dim=0)
        best = int(torch.argmax(prob))
        finish = time()
        print("識别時間:")
        print(finish - start)

        label = labels[best]
        bgm_label.append(label)
        # Write the annotated copy of the frame.
        putText(filename, label)

    return bgm_label
           

#bgm.py

    給視訊按照手勢識别标簽制作并添加音頻。針對手勢變換時音頻快速變換的問題,采用一個标記矩陣label_flag記錄所有标簽中标簽變換的位置資訊,同時len_flag矩陣存儲每一個連續标簽存在的時長。對于小于一定連續幀長度(程式中為12幀)的标簽做容錯處理,預設其識别錯誤,用相應時長的空白音頻進行填充;對于大于30幀的連續幀标記,進行一次語音播報,剩下時長用等時長的空白音頻填充。

# Maps each gesture label to the number N of its audio file ``bgm/N.wav``.
bgm_dict = {
    'Congratulation': 1,
    'Eight': 2,
    'Fist': 3,
    'Five': 4,
    'Four': 5,
    'Heart_1': 6,
    'Heart_2': 7,
    'Heart_3': 8,
    'Heart_single': 9,
    'Honour': 10,
    'ILY': 11,
    'Insult': 12,
    'Nine': 13,
    'OK': 14,
    'One': 15,
    'Palm_up': 16,
    'Prayer': 17,
    'Rock': 18,
    'Seven': 19,
    'Six': 20,
    'Three': 21,
    'Thumb_down': 22,
    'Thumb_up': 23,
    'Two': 24,
}

# Seconds of audio generated per video frame.
# NOTE(review): presumably an approximation of 1/30 s -- confirm against the
# frame rate of the video the audio is merged with.
_SECONDS_PER_FRAME = 0.033
# Runs shorter than this many frames are treated as misrecognitions.
_MIN_RUN_FRAMES = 12
# Longest stretch (in frames) of a run that is covered by the announcement.
_MAX_ANNOUNCE_FRAMES = 30
# Audio used wherever no announcement is played.
_FILLER_PATH = 'bgm/0.wav'


def _label_boundaries(bgm_label):
    """Return the frame indices at which a new run of equal labels starts.

    The list starts with 0 and ends with ``len(bgm_label)``, so consecutive
    pairs are half-open ``[start, end)`` frame ranges.
    """
    boundaries = [0]
    current = bgm_label[0]
    for index, label in enumerate(bgm_label):
        if label != current:
            boundaries.append(index)
            current = label
    # Exclusive end of the last run. The previous ``len - 1`` left the final
    # frame without audio.
    boundaries.append(len(bgm_label))
    return boundaries


def add_bgm3(bgm_label):
    """Build the audio track ``bgm/clip.wav`` from per-frame gesture labels.

    Consecutive frames carrying the same label form a run. For each run:

    * fewer than 12 frames: filler audio from ``bgm/0.wav`` for the run's
      duration (the run is treated as a misrecognition);
    * more than 30 frames: the first 30 frames' worth of the label's
      ``bgm/N.wav``, then filler audio for the remaining duration;
    * otherwise: the label's ``bgm/N.wav`` cut to the run's duration.

    NOTE(review): a slice presumably cannot be longer than the WAV file it
    is taken from, so the files need to be at least as long as the slices
    requested here -- confirm.

    Args:
        bgm_label: Non-empty sequence with one label per video frame; each
            label must be a key of ``bgm_dict``.

    Raises:
        ValueError: If *bgm_label* is empty.
        KeyError: If a run's label is missing from ``bgm_dict``.
    """
    if not bgm_label:
        raise ValueError("bgm_label must contain at least one frame label")

    segments = {}

    def load(path):
        # Decode each WAV file once, the first time it is needed.
        if path not in segments:
            segments[path] = AudioSegment.from_wav(path)
        return segments[path]

    # Seed clip: a 0.1 ms slice of bgm/1.wav that all runs are appended to.
    clip = load('bgm/1.wav')[:0.0001 * 1000]

    boundaries = _label_boundaries(bgm_label)
    for run_start, run_end in zip(boundaries, boundaries[1:]):
        # Resolved for every run, so an unknown label fails immediately.
        announcement_path = "bgm/%d" % bgm_dict[bgm_label[run_start]] + ".wav"
        run_frames = run_end - run_start
        run_ms = run_frames * _SECONDS_PER_FRAME * 1000

        if run_frames < _MIN_RUN_FRAMES:
            clip = clip + load(_FILLER_PATH)[:run_ms]
        elif run_frames > _MAX_ANNOUNCE_FRAMES:
            announce_ms = _MAX_ANNOUNCE_FRAMES * _SECONDS_PER_FRAME * 1000
            clip = clip + load(announcement_path)[:announce_ms]
            clip = clip + load(_FILLER_PATH)[:run_ms - announce_ms]
        else:
            clip = clip + load(announcement_path)[:run_ms]

    # export() hands back the output file handle; close it explicitly.
    clip.export('bgm/clip.wav', format='wav').close()

           

    将對應音頻添加到合成好的視訊上

def video_merge2(outpath):
    """Attach ``bgm/clip.wav`` to ``video/saveVideo.mp4`` and write *outpath*.

    Args:
        outpath: Path of the video file to write.
    """
    audio = AudioFileClip("bgm/clip.wav")
    try:
        video = VideoFileClip('video/saveVideo.mp4')
        try:
            # set_audio returns a new clip carrying the audio track.
            video.set_audio(audio).write_videofile(outpath)
        finally:
            # Close the video reader even when writing fails.
            video.close()
    finally:
        audio.close()
           

###combination.py

    将識别完并打上标簽的手勢圖檔合成為視訊

def combination(length):
    """Encode the labelled frames into ``video/saveVideo.mp4`` at 30 fps.

    Frames are read as ``images/rec_image/0.jpg`` ..
    ``images/rec_image/<length-1>.jpg``. The output size is taken from
    frame 0, which is read even when *length* is 0.

    Args:
        length: Number of frames to encode.

    Raises:
        FileNotFoundError: If frame 0 or any requested frame cannot be read.
    """
    img_root = "images/rec_image/"
    fps = 30
    file_path = 'video/saveVideo.mp4'

    first = cv2.imread(img_root + "0.jpg")
    if first is None:
        # imread signals failure by returning None instead of raising.
        raise FileNotFoundError("cannot read " + img_root + "0.jpg")
    # The array shape is (rows, columns, ...); VideoWriter takes
    # (width, height).
    height, width = first.shape[:2]

    fourcc = cv2.VideoWriter_fourcc(*'mp4v')
    videoWriter = cv2.VideoWriter(file_path, fourcc, fps, (width, height))
    try:
        # Frame files are numbered consecutively starting at 0.jpg.
        for i in range(length):
            frame_path = img_root + str(i) + '.jpg'
            frame = cv2.imread(frame_path)
            if frame is None:
                raise FileNotFoundError("cannot read " + frame_path)
            videoWriter.write(frame)
    finally:
        # Release the writer even if a frame is missing.
        videoWriter.release()
           

putlabel.py

    對視訊分解為幀的圖檔進行手勢識别并貼上标簽

def putText(image, label):
    """Draw *label* on an image and save the copy under ``images/rec_image/``.

    The output keeps the input's file name, taken as everything after the
    last ``/`` in the path.

    Args:
        image: Path of the image file to annotate.
        label: Text to draw; surrounding whitespace is stripped.

    Raises:
        FileNotFoundError: If the input image cannot be read.
        OSError: If the annotated image cannot be written.
    """
    print(image)
    imagename = image.rsplit("/", 1)[-1]
    savepath = 'images/rec_image/' + imagename
    print(savepath)

    label = label.strip()
    frame = cv2.imread(image)
    if frame is None:
        # imread signals failure by returning None instead of raising.
        raise FileNotFoundError("cannot read image: " + image)

    frame = cv2.putText(frame, label, (100, 100),
                        cv2.FONT_HERSHEY_SIMPLEX, 2, (255, 0, 0), 2)
    if not cv2.imwrite(savepath, frame):
        # imwrite reports failure through its return value only.
        raise OSError("cannot write image: " + savepath)
           

###main.py

    主函數

def main():
    """Run the pipeline: split, classify, re-encode, add the audio track.

    Command line: ``python main.py <input video> <output video>``. Both
    names are resolved relative to the ``video/`` directory.

    Raises:
        SystemExit: With a usage message when fewer than two arguments are
            given.
    """
    if len(sys.argv) < 3:
        raise SystemExit(
            "usage: python main.py <input video> <output video>")

    videopath = 'video/' + str(sys.argv[1])
    outpath = 'video/' + str(sys.argv[2])

    # Split the input video into per-frame images.
    img_to_video(videopath)

    # Classify every frame, save the annotated copies, collect the labels.
    filepath = 'images/ffmpeg_img/'
    bgm_label = gesture_recognition(filepath)

    # Encode the annotated frames back into a video.
    combination(len(bgm_label))

    # Build the audio track from the labels and attach it to the video.
    add_bgm3(bgm_label)
    video_merge2(outpath)
    
           

使用

指令行使用

example:
python main.py test.mp4(輸入) out.mp4(輸出)
           
gitee連結:https://gitee.com/ceasarxo/gesture-recognition

繼續閱讀