当前位置:首页 » 《我的小黑屋》 » 正文

Unity对接科大讯飞实时语音转写WebAPI(Windows平台)(一)

3 人参与  2024年09月15日 09:21  分类 : 《我的小黑屋》  评论

点击全文阅读


科大讯飞官方文档:实时语音转写 API 文档 | 讯飞开放平台文档中心 (xfyun.cn)

参考文章:unity通过WebAPI连接Websocket实现讯飞语音识别与合成。_unity websocket audio-CSDN博客

        要实现语音转文字,首先需要从麦克风获取语音数据,这里用到了Unity自带的Microphone类;其次,需要将语音数据发送给讯飞,这里用的是WebSocketSharp.WebSocket(用习惯了);最后就是按照文档一步步踩坑了。

        直接贴代码了。代码主要实现握手阶段参数签名,实时通信阶段的数据传输以及结果解析。

using System;
using System.Collections;
using System.Collections.Generic;
using System.Security.Cryptography;
using System.Text;
using LitJson;
using Newtonsoft.Json;
using UnityEngine;
using WebSocketSharp;

/// <summary>
/// Streams microphone audio to the iFlytek real-time speech transcription
/// WebSocket endpoint and reports recognized text.
/// </summary>
/// <remarks>
/// Flow: <see cref="StartSpeech"/> starts a 16 kHz mono recording and opens the
/// socket; a coroutine then uploads 16-bit PCM chunks; server replies are queued
/// on the socket thread and parsed on the Unity main thread in <c>Update</c>.
/// </remarks>
public class SpeechHelper : MonoBehaviour
{
    // Credentials used to sign the handshake. Replace the placeholders with real values.
    private const string AppId = "appid";
    private const string ApiKey = "appkey";

    private const int SampleRate = 16000;        // recording sample rate passed to Microphone.Start
    private const int MaxRecordSeconds = 60;     // length of the recording clip in seconds
    private const float SendInterval = 0.04f;    // pause between two uploads (40 ms)
    private const int MaxSamplesPerSend = 640;   // 640 samples = 1280 bytes = 40 ms at 16 kHz
    private const int PcmScale = 32767;          // float [-1, 1] -> signed 16-bit

    /// <summary>Raised on the main thread with the text of every final recognition result.</summary>
    public event Action<string> 语音识别完成事件;   //语音识别回调事件

    /// <summary>Clip the microphone is currently recording into.</summary>
    public AudioClip RecordedClip;

    private string micphoneName = string.Empty;
    WebSocket speechWebSocket;
    private System.Action<string> resultCallback;
    private Coroutine sendCoroutine;

    // Filled by the WebSocket thread, drained by Update on the main thread.
    // Queue<T> is not thread-safe, so every access is done under lock (messageQueue).
    private static readonly Queue<string> messageQueue = new Queue<string>();

    /// <summary>Registers the callback that receives the text of each final result.</summary>
    /// <param name="textCallback">Invoked on the main thread; may be null.</param>
    public void InitSpeechHelper(System.Action<string> textCallback)
    {
        resultCallback = textCallback;
    }

    /// <summary>
    /// Starts recording from the first microphone and opens the recognition socket.
    /// Does nothing when a previous session is still open or no microphone exists.
    /// </summary>
    public void StartSpeech()
    {
        if (speechWebSocket != null && speechWebSocket.ReadyState == WebSocketState.Open)
        {
            Debug.LogWarning("开始语音识别失败!,等待上次识别连接结束");
            return;
        }

        if (Microphone.devices.Length <= 0)
        {
            Debug.LogWarning("找不到麦克风");
            return;
        }

        lock (messageQueue)
        {
            messageQueue.Clear();
        }

        micphoneName = Microphone.devices[0];
        Debug.Log("micphoneName:" + micphoneName);

        try
        {
            RecordedClip = Microphone.Start(micphoneName, false, MaxRecordSeconds, SampleRate);
            if (RecordedClip == null)
            {
                // Without a clip the upload coroutine would fail on its first access.
                Debug.LogError("麦克风启动失败");
                return;
            }

            ConnectSpeechWebSocket();
        }
        catch (Exception ex)
        {
            Debug.LogError(ex.Message);
        }
    }

    /// <summary>Stops the recording; the upload coroutine then sends the end marker.</summary>
    public void StopSpeech()
    {
        Microphone.End(micphoneName);
        Debug.Log("识别结束,停止录音");
    }

    /// <summary>Creates the socket, wires its events, connects and starts the upload coroutine.</summary>
    void ConnectSpeechWebSocket()
    {
        try
        {
            speechWebSocket = new WebSocket(GetWebSocketUrl());
        }
        catch (Exception ex)
        {
            UnityEngine.Debug.LogError(ex.Message);
            // Nothing will consume the audio, so do not leave the microphone recording.
            Microphone.End(micphoneName);
            return;
        }

        speechWebSocket.OnOpen += (sender, e) => Debug.Log("OnOpen");
        speechWebSocket.OnClose += OnWebSocketClose;
        speechWebSocket.OnMessage += OnInitMessage;
        speechWebSocket.OnError += OnError;
        speechWebSocket.ConnectAsync();

        // A coroutine left over from an earlier session must not keep sending on the new socket.
        if (sendCoroutine != null)
        {
            StopCoroutine(sendCoroutine);
        }

        sendCoroutine = StartCoroutine(SendVoiceData());
    }

    void OnWebSocketClose(object sender, CloseEventArgs e)
    {
        Debug.Log("OnWebSocketClose");
    }

    /// <summary>Socket-thread handler: logs the raw reply and queues it for the main thread.</summary>
    void OnInitMessage(object sender, MessageEventArgs e)
    {
        UnityEngine.Debug.Log("qqqqqqqqqqqqqWebSocket数据返回:" + e.Data);
        lock (messageQueue)
        {
            messageQueue.Enqueue(e.Data);
        }
    }

    /// <summary>
    /// Parses one server reply on the main thread and dispatches final results to
    /// the registered callback and to <see cref="语音识别完成事件"/>.
    /// </summary>
    /// <param name="message">Raw JSON text received from the server.</param>
    private void MainThreadOnMessage(string message)
    {
        try
        {
            XFResponse response = JsonConvert.DeserializeObject<XFResponse>(message);
            if (response.code != 0)
            {
                // Surface server-side failures instead of dropping them silently.
                Debug.LogWarning("讯飞返回错误,code:" + response.code + ",desc:" + response.desc);
                return;
            }

            // string.Equals tolerates a reply without an "action" field.
            if (!string.Equals(response.action, "result", StringComparison.Ordinal))
            {
                return;
            }

            SpeechRecognitionResult result = ParseXunfeiRecognitionResult(response.data);
            if (result.IsFinal)
            {
                Debug.Log("Text最终:" + result.Text);
                resultCallback?.Invoke(result.Text);
                语音识别完成事件?.Invoke(result.Text);
            }
            else
            {
                Debug.Log("Text中间:" + result.Text);
            }
        }
        catch (Exception ex)
        {
            Debug.LogError(ex.Message);
        }
    }

    void OnError(object sender, ErrorEventArgs e)
    {
        UnityEngine.Debug.Log("WebSoclet:发生错误:" + e.Message);
    }

    /// <summary>
    /// Extracts the recognized text from the <c>data</c> payload of a "result" reply.
    /// </summary>
    /// <param name="dataJson">JSON with the shape cn.st.{ed, rt[].ws[].cw[].w}.</param>
    /// <returns>
    /// The concatenated words and whether the sentence is final (<c>ed</c> differs from "0").
    /// On a parse error the error is logged and the text collected so far is returned.
    /// </returns>
    public SpeechRecognitionResult ParseXunfeiRecognitionResult(string dataJson)
    {
        StringBuilder builder = new StringBuilder();
        SpeechRecognitionResult res = new SpeechRecognitionResult();
        try
        {
            JsonData sentence = JsonMapper.ToObject(dataJson)["cn"]["st"];
            res.IsFinal = !sentence["ed"].ToString().Equals("0");

            foreach (JsonData rtObject in sentence["rt"])
            {
                foreach (JsonData wsObject in rtObject["ws"])
                {
                    foreach (JsonData cwObject in wsObject["cw"])
                    {
                        builder.Append(cwObject["w"].ToString());
                    }
                }
            }
        }
        catch (Exception ex)
        {
            Debug.LogError(ex.Message);
        }

        res.Text = builder.ToString();
        return res;
    }

    /// <summary>Sends one chunk of PCM audio if the socket is open; otherwise does nothing.</summary>
    /// <param name="voiceData">Raw 16-bit little-endian PCM bytes.</param>
    void SendData(byte[] voiceData)
    {
        if (voiceData == null || voiceData.Length == 0)
        {
            return;
        }

        Debug.Log("SendData:" + voiceData.Length + ",time:" + Time.realtimeSinceStartup);

        // Null check first. ReadyState is used rather than IsAlive because IsAlive
        // issues a blocking ping, which is too expensive on this per-chunk path.
        if (speechWebSocket == null || speechWebSocket.ReadyState != WebSocketState.Open)
        {
            return;
        }

        try
        {
            speechWebSocket.SendAsync(voiceData, success =>
            {
                if (success)
                {
                    UnityEngine.Debug.Log("WebSoclet:发送成功:" + voiceData.Length);
                }
                else
                {
                    UnityEngine.Debug.Log("WebSoclet:发送失败:");
                }
            });
        }
        catch (Exception ex)
        {
            Debug.LogError("WebSoclet:发送异常:" + ex.Message);
        }
    }

    /// <summary>Sends the end-of-audio marker as a binary message.</summary>
    /// <param name="callback">Invoked after the send attempt completes; may be null.</param>
    void SendEndMsg(System.Action callback)
    {
        string endMsg = "{\"end\": true}";
        byte[] data = Encoding.UTF8.GetBytes(endMsg);

        if (speechWebSocket == null || speechWebSocket.ReadyState != WebSocketState.Open)
        {
            return;
        }

        try
        {
            speechWebSocket.SendAsync(data, success =>
            {
                if (success)
                {
                    UnityEngine.Debug.Log("WebSoclet:发送END成功:" + data.Length);
                }
                else
                {
                    UnityEngine.Debug.Log("WebSoclet:发送END失败:");
                }

                callback?.Invoke();
            });
        }
        catch (Exception ex)
        {
            Debug.LogError("WebSoclet:发送END异常:" + ex.Message);
        }
    }

    /// <summary>
    /// Coroutine that uploads the recording in chunks of at most 640 samples every
    /// 40 ms, then sends the end marker and stops the microphone.
    /// </summary>
    /// <remarks>
    /// The real delay of WaitForSecondsRealtime depends on the frame rate, so the
    /// upload can lag behind the recording when frames take longer than 40 ms.
    /// </remarks>
    IEnumerator SendVoiceData()
    {
        // Keep using the socket this session started with, even if the field is replaced later.
        WebSocket socket = speechWebSocket;

        // Wait for the handshake to finish; also stop waiting when it failed.
        yield return new WaitUntil(() =>
            socket.ReadyState == WebSocketState.Open || socket.ReadyState == WebSocketState.Closed);

        if (socket.ReadyState != WebSocketState.Open)
        {
            Debug.LogWarning("WebSocket连接失败,停止录音");
            Microphone.End(micphoneName);
            sendCoroutine = null;
            yield break;
        }

        // Wait for the first samples, but not forever if recording was stopped meanwhile.
        yield return new WaitWhile(() =>
            Microphone.IsRecording(micphoneName) && Microphone.GetPosition(micphoneName) <= 0);

        int position = Microphone.GetPosition(micphoneName);
        int lastPosition = 0;

        while (position < RecordedClip.samples && socket.ReadyState == WebSocketState.Open)
        {
            yield return new WaitForSecondsRealtime(SendInterval);

            if (Microphone.IsRecording(micphoneName))
            {
                position = Microphone.GetPosition(micphoneName);
            }

            if (position <= lastPosition)
            {
                Debug.LogWarning("字节流发送完毕!强制结束!");
                break;
            }

            int length = Mathf.Min(position - lastPosition, MaxSamplesPerSend);
            SendData(GetClipData(lastPosition, length, RecordedClip));
            lastPosition += length;
        }

        // The end marker must also be spaced from the last audio chunk.
        yield return new WaitForSecondsRealtime(SendInterval);
        SendEndMsg(null);
        Microphone.End(micphoneName);
        sendCoroutine = null;
    }

    /// <summary>Converts a range of clip samples to 16-bit little-endian PCM bytes.</summary>
    /// <param name="star">Index of the first sample to read.</param>
    /// <param name="length">Number of samples to read.</param>
    /// <param name="recordedClip">Clip to read from.</param>
    /// <returns>Two bytes per sample; an empty array when <paramref name="length"/> is not positive.</returns>
    public byte[] GetClipData(int star, int length, AudioClip recordedClip)
    {
        if (length <= 0)
        {
            return new byte[0];
        }

        float[] samples = new float[length];
        recordedClip.GetData(samples, star);

        byte[] outData = new byte[samples.Length * 2];
        for (int i = 0; i < samples.Length; i++)
        {
            // Clamp so an out-of-range sample cannot overflow the short conversion.
            short pcm = (short)(Mathf.Clamp(samples[i], -1f, 1f) * PcmScale);
            // Write the low byte first (little-endian) regardless of platform endianness.
            outData[i * 2] = (byte)(pcm & 0xFF);
            outData[i * 2 + 1] = (byte)((pcm >> 8) & 0xFF);
        }

        return outData;
    }

    /// <summary>
    /// Builds the handshake URL. The signature is
    /// Base64(HmacSHA1(MD5(appid + ts), apiKey)), URL-encoded.
    /// </summary>
    private string GetWebSocketUrl()
    {
        string ts = GetCurrentUnixTimestampMillis().ToString();
        string baseString = AppId + ts;
        string md5 = GetMD5Hash(baseString);
        UnityEngine.Debug.Log("baseString:" + baseString + ",md5:" + md5);

        // Base64 may contain '+', '/' and '=', which are not safe inside a query string.
        string signa = Uri.EscapeDataString(CalculateHmacSha1(md5, ApiKey));

        string url = string.Format("ws://rtasr.xfyun.cn/v1/ws?appid={0}&ts={1}&signa={2}", AppId, ts, signa);
        UnityEngine.Debug.Log(url);
        return url;
    }

    /// <summary>
    /// Returns the current Unix time in SECONDS (the handshake expects seconds;
    /// the method name is kept for compatibility).
    /// </summary>
    private long GetCurrentUnixTimestampMillis()
    {
        // Computed in UTC so that time zone and DST offsets cannot skew the value.
        DateTime unixEpoch = new DateTime(1970, 1, 1, 0, 0, 0, DateTimeKind.Utc);
        return (long)(DateTime.UtcNow - unixEpoch).TotalSeconds;
    }

    /// <summary>Returns the lowercase hexadecimal MD5 digest of the UTF-8 bytes of <paramref name="input"/>.</summary>
    public string GetMD5Hash(string input)
    {
        using (MD5 md5Hasher = MD5.Create())
        {
            // UTF-8 instead of Encoding.Default, whose code page varies by platform.
            byte[] digest = md5Hasher.ComputeHash(Encoding.UTF8.GetBytes(input));
            StringBuilder hex = new StringBuilder(digest.Length * 2);
            foreach (byte b in digest)
            {
                hex.Append(b.ToString("x2"));
            }

            return hex.ToString();
        }
    }

    /// <summary>Returns Base64(HMAC-SHA1(<paramref name="data"/>)) keyed with <paramref name="key"/>, both UTF-8 encoded.</summary>
    public string CalculateHmacSha1(string data, string key)
    {
        using (HMACSHA1 hmac = new HMACSHA1(Encoding.UTF8.GetBytes(key)))
        {
            byte[] hashBytes = hmac.ComputeHash(Encoding.UTF8.GetBytes(data));
            return Convert.ToBase64String(hashBytes);
        }
    }

    /// <summary>Processes, on the main thread, every server reply queued since the last frame.</summary>
    private void Update()
    {
        while (true)
        {
            string message;
            lock (messageQueue)
            {
                if (messageQueue.Count == 0)
                {
                    break;
                }

                message = messageQueue.Dequeue();
            }

            // Handled outside the lock so the socket thread is never blocked by parsing.
            MainThreadOnMessage(message);
        }
    }

    /// <summary>Releases the microphone and the socket when the component is destroyed.</summary>
    private void OnDestroy()
    {
        if (!string.IsNullOrEmpty(micphoneName) && Microphone.IsRecording(micphoneName))
        {
            Microphone.End(micphoneName);
        }

        if (speechWebSocket != null)
        {
            speechWebSocket.OnMessage -= OnInitMessage;
            speechWebSocket.OnError -= OnError;
            speechWebSocket.OnClose -= OnWebSocketClose;
            speechWebSocket.CloseAsync();
            speechWebSocket = null;
        }
    }
}

Json解析类。

/// <summary>
/// One JSON reply from the recognition server, as deserialized by
/// <c>JsonConvert.DeserializeObject&lt;XFResponse&gt;</c>.
/// </summary>
[Serializable]
public struct XFResponse
{
    /// <summary>Reply type; the caller only processes the value "result".</summary>
    public string action;

    /// <summary>Status code; the caller treats any non-zero value as a failure.</summary>
    public int code;

    /// <summary>Payload; for "result" replies this is the JSON parsed into a <see cref="SpeechRecognitionResult"/>.</summary>
    public string data;

    /// <summary>Description text sent with the reply (presumably a human-readable status; confirm against the API documentation).</summary>
    public string desc;

    /// <summary>Identifier sent with the reply (presumably the session id; confirm against the API documentation).</summary>
    public string sid;
}

/// <summary>Recognized text extracted from one "result" reply.</summary>
[Serializable]
public struct SpeechRecognitionResult
{
    /// <summary>Concatenation of all recognized words in the reply.</summary>
    public string Text;

    /// <summary>True for a final sentence result, false for an intermediate one.</summary>
    public bool IsFinal;
}

值得注意的问题。

1、Microphone使用时传默认设备名比传null好使

2、握手阶段时间戳用的是秒(不是毫秒)

3、上传结束标志时,也要间隔40ms,否则讯飞像是没收到一样

4、如果Microphone.devices的长度为0,电脑确实又有麦克风设备,那么可能是麦克风的名字是中文导致的

遗留问题:

yield return new WaitForSecondsRealtime(0.04f)实际间隔时间0.1s左右,导致消息发送得很慢,语音识别慢。

2024.5.24更新第二篇,有效解决消息发送慢,识别慢的问题

2024.6.19更新:取消协程中发送数据,直接在Update中发送。解决消息发送很慢问题

    /// <summary>
    /// Per-frame pump: uploads the audio recorded since the previous frame and
    /// processes the server replies that are waiting in the queue.
    /// </summary>
    private void Update()
    {
        if (isRunning)
        {
            byte[] voiceData = GetVoiveData();
            if (voiceData != null)
            {
                SendData(voiceData);
            }
        }

        // Queue<T> is not thread-safe and the replies are enqueued from the socket
        // thread, so the queue is only touched while holding its monitor. (The
        // enqueue side has to take the same lock for this to be fully effective.)
        while (true)
        {
            string message;
            lock (messageQueue)
            {
                if (messageQueue.Count == 0)
                {
                    break;
                }

                message = messageQueue.Dequeue();
            }

            MainThreadOnMessage(message);
        }
    }

    // Index of the next unread sample in RecordedClip. A negative value means
    // "nothing read yet" and is treated as 0.
    private int last_length = -1;

    // Reusable scratch buffer for samples; grown on demand.
    private float[] volumeData = new float[9999];

    bool isRunning;

    /// <summary>
    /// Returns the samples recorded since the previous call as 16-bit
    /// little-endian PCM bytes.
    /// </summary>
    /// <returns>
    /// Two bytes per new sample, or null when there is no clip, the microphone is
    /// not recording, or no new samples have arrived.
    /// </returns>
    private byte[] GetVoiveData()
    {
        if (RecordedClip == null)
        {
            return null;
        }

        int totalSamples = RecordedClip.samples;
        if (totalSamples <= 0)
        {
            return null;
        }

        // Once recording has stopped GetPosition no longer marks the end of valid
        // data, so a "wrapped" position would make us upload the unrecorded rest
        // of the clip. Query the same device that the recording was started on.
        if (!Microphone.IsRecording(micphoneName))
        {
            if (Microphone.devices.Length == 0)
            {
                isRunning = false;
            }

            return null;
        }

        int readStart = last_length < 0 ? 0 : last_length % totalSamples;
        int writePos = Microphone.GetPosition(micphoneName) % totalSamples;

        if (writePos == readStart)
        {
            if (Microphone.devices.Length == 0)
            {
                isRunning = false;
            }

            return null;
        }

        // New samples are [readStart, writePos). The sample AT writePos has not
        // been recorded yet and must not be sent.
        int length = writePos - readStart;
        if (length < 0)
        {
            // The recording wrapped around the end of the clip.
            length += totalSamples;
        }

        if (volumeData.Length < length)
        {
            volumeData = new float[length];
        }

        // AudioClip.GetData wraps to the start of the clip when the read runs past
        // its end, so one call covers both the normal and the wrapped case.
        RecordedClip.GetData(volumeData, readStart);
        last_length = writePos;

        const int rescaleFactor = 32767; // float [-1, 1] -> signed 16-bit
        byte[] bytesData = new byte[length * 2];
        for (int i = 0; i < length; i++)
        {
            // Clamp so an out-of-range sample cannot overflow the short conversion.
            short pcm = (short)(Mathf.Clamp(volumeData[i], -1f, 1f) * rescaleFactor);
            bytesData[i * 2] = (byte)(pcm & 0xFF);
            bytesData[i * 2 + 1] = (byte)((pcm >> 8) & 0xFF);
        }

        return bytesData;
    }

2024.8更新:尽管PC上的麦克风名称已经改成英文或者数字,依然容易出现找不到麦克风的问题。该问题可以通过使用NAudio.dll解决,详情见下一篇。


点击全文阅读


本文链接:http://zhangshiyu.com/post/160137.html

<< 上一篇 下一篇 >>

  • 评论(0)
  • 赞助本站

◎欢迎参与讨论,请在这里发表您的看法、交流您的观点。

关于我们 | 我要投稿 | 免责申明

Copyright © 2020-2022 ZhangShiYu.com Rights Reserved.豫ICP备2022013469号-1