Contents

GSoC 2022 :: Weekly Update: Week 6


This post will briefly cover:

  • Learnings
  • Tasks done, and those in progress
  • Helpful resources

For the project proposal, visit here.


Create a recipe for the Python-based Vosk WebSocket server:

GitHub Repo: https://github.com/alphacep/vosk-server/tree/master/websocket

Recipe created: python3-vosk-websocket-server_git.bb

 1
 2
 3
 4
 5
 6
 7
 8
 9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
# Recipe metadata for the Python WebSocket server from the vosk-server repository.
DESCRIPTION = "WebSocket, gRPC and WebRTC speech recognition server based on Vosk and Kaldi libraries"
SUMMARY = "This is a server for highly accurate offline speech recognition using Kaldi and Vosk-API."
HOMEPAGE = "https://github.com/alphacep/vosk-server"
LICENSE = "Apache-2.0"
# Checksum of the COPYING file, looked up relative to ${S}.
LIC_FILES_CHKSUM = "file://COPYING;md5=d09bbd7a3746b6052fbd78b26a87396b"

# Sources are fetched from the upstream Git repository (master branch, over HTTPS).
SRC_URI = "git://github.com/alphacep/vosk-server;protocol=https;branch=master"

# The package version embeds the Git revision; SRCREV pins the exact commit that is built.
PV = "1.0+git${SRCPV}"
SRCREV = "70f3d5321a40f2f5dffe9c833bc1fac4b3b451e7"

# Source directory: the "git" directory under WORKDIR.
S = "${WORKDIR}/git"

# Runtime dependencies of the installed server script.
# NOTE(review): virtual/vosk-model is presumably provided by a model recipe elsewhere in the layer - confirm.
RDEPENDS:${PN} += " \
    vosk \
    python3-vosk-api \
    python3-websockets \
    virtual/vosk-model \
"

do_configure () {
	# Intentionally a no-op: this task performs no configuration work.
	true
}

do_compile () {
	# Intentionally a no-op: this task performs no compilation work.
	true
}

# Install the upstream WebSocket ASR server script into ${bindir} as
# vosk-websocket-python.py.
#   Reads:  S (source tree), D (install destination root), bindir
#   Writes: ${D}${bindir}/vosk-websocket-python.py with mode 0755
do_install () {
	install -d ${D}${bindir}
	# install -m 0755 copies the file and sets an explicit mode in one step,
	# so the result does not depend on the source file's mode or the umask
	# (cp followed by chmod a+x only adds the execute bits).
	install -m 0755 ${S}/websocket/asr_server.py ${D}${bindir}/vosk-websocket-python.py
}

Enable ptest in the recipe for vosk-api:

Recipe name: python3-vosk-api_0.3.42.bb

 1
 2
 3
 4
 5
 6
 7
 8
 9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
# Recipe metadata for the Python bindings in the vosk-api repository.
SUMMARY = "Offline open source speech recognition API based on Kaldi and Vosk"
HOMEPAGE = "https://github.com/alphacep/vosk-api"
AUTHOR = "aman.arora9848@gmail.com"
LICENSE = "Apache-2.0"
# S points at the python/ subdirectory, so the COPYING file is referenced one level up.
LIC_FILES_CHKSUM = "file://../COPYING;md5=d09bbd7a3746b6052fbd78b26a87396b"

# Upstream Git sources plus two local files: a patch and the run-ptest script.
SRC_URI = "git://github.com/alphacep/vosk-api;protocol=https;branch=master \
           file://0001-Change-library-search-path-to-usr-lib.patch \
           file://run-ptest \
           " 

# The package version embeds the Git revision; SRCREV pins the exact commit that is built.
PV = "0.3.42+git${SRCPV}"
SRCREV = "b1b216d4c87d708935f1601287fe502aa11ee4a9"

# Source directory: the python/ subdirectory of the Git checkout.
S = "${WORKDIR}/git/python"

# setuptools3 and ptest classes are pulled in here.
# NOTE(review): ptest presumably supplies PTEST_PATH and the do_install_ptest hook - confirm.
inherit setuptools3 ptest

# Build-time dependencies (all -native variants).
DEPENDS += " \
    python3-srt-native \
    python3-tqdm-native \
    python3-requests-native \
    python3-pip-native \
    python3-charset-normalizer-native \
"

# Runtime dependencies of the installed package.
# NOTE(review): virtual/vosk-model is presumably provided by a model recipe elsewhere in the layer - confirm.
RDEPENDS:${PN} += " \
    python3-cffi \
    python3-compression \
    python3-core \
    python3-datetime \
    python3-json \
    python3-logging \
    python3-misc \
    python3-multiprocessing \
    python3-netclient \
    python3-requests \
    python3-tqdm \
    python3-srt \
    python3-charset-normalizer \
    python3-sounddevice \
    virtual/vosk-model \
"

# Stage the ptest payload: copy test_simple.py and test.wav from the
# example/ directory of the source tree into ${D}${PTEST_PATH}/tests/.
do_install_ptest () {
    install -d ${D}${PTEST_PATH}/tests/
    for f in test_simple.py test.wav; do
        cp ${S}/example/$f ${D}${PTEST_PATH}/tests/
    done
}

Test the Vosk API on the AGL demo platform:

 1
 2
 3
 4
 5
 6
 7
 8
 9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
root@qemux86-64:~# ptest-runner python3-vosk-api
START: ptest-runner     
2022-07-26T09:50 
BEGIN: /usr/lib/python3-vosk-api/ptest
LOG (VoskAPI:ReadDataFiles():model.cc:213) Decoding params beam=10 max-active=3000 lattice-beam=2
LOG (VoskAPI:ReadDataFiles():model.cc:216) Silence phones 1:2:3:4:5:6:7:8:9:10 
LOG (VoskAPI:RemoveOrphanNodes():nnet-nnet.cc:948) Removed 0 orphan nodes.
LOG (VoskAPI:RemoveOrphanComponents():nnet-nnet.cc:847) Removing 0 orphan components. 
LOG (VoskAPI:ReadDataFiles():model.cc:248) Loading i-vector extractor from /usr/share/vosk/vosk-model-small-en-us-0.15/ivector/final.ie
LOG (VoskAPI:ComputeDerivedVars():ivector-extractor.cc:183) Computing derived variables for iVector extractor 
LOG (VoskAPI:ComputeDerivedVars():ivector-extractor.cc:204) Done.
LOG (VoskAPI:ReadDataFiles():model.cc:282) Loading HCL and G from /usr/share/vosk/vosk-model-small-en-us-0.15/graph/HCLr.fst /usr/share/vosk/vosk-model-small-
en-us-0.15/graph/Gr.fst
LOG (VoskAPI:ReadDataFiles():model.cc:303) Loading winfo /usr/share/vosk/vosk-model-small-en-us-0.15/graph/phones/word_boundary.int
{ 
  "partial" : ""
}

...(output omitted)

{
  "result" : [{
      "conf" : 1.000000,
      "end" : 6.690000,
      "start" : 6.240000,
      "word" : "zero"
    }, {
      "conf" : 1.000000,
      "end" : 6.900000,
      "start" : 6.690000,
      "word" : "one"
    }, {
      "conf" : 1.000000,
      "end" : 7.110000,
      "start" : 6.930000,
      "word" : "eight"
    }, {
      "conf" : 1.000000,
      "end" : 7.500000,
      "start" : 7.110000,
      "word" : "zero"
    }, {
      "conf" : 1.000000,
      "end" : 7.980000,
      "start" : 7.500000,
      "word" : "three"
    }],
  "text" : "zero one eight zero three"
}
DURATION: 3
END: /usr/lib/python3-vosk-api/ptest
2022-07-26T09:50
STOP: ptest-runner
TOTAL: 1 FAIL: 0

Add the following two lines to local.conf.inc in the agl-offline-voice-agent feature template to enable ptest and install the ptest packages.

1
2
DISTRO_FEATURES:append = " ptest"
EXTRA_IMAGE_FEATURES += "ptest-pkgs"

Add the ptest-runner package to the image via IMAGE_INSTALL:append.

AGL server demo:

Tested on a personal machine with QEMU and the AGL demo platform image: agl-demo-platform-qemux86-64.wic.vmdk.

1
2
3
4
5
6
7
$ export OVMF_PATH=/usr/share/ovmf/OVMF.fd
$ qemu-system-x86_64 -enable-kvm -m 2048 -bios ${OVMF_PATH} -hda agl-demo-platform-qemux86-64.wic.vmdk -cpu kvm64 -cpu qemu64,+ssse3,+sse4.1,+sse4.2,+popcnt     -vga virtio -show-cursor     -device virtio-rng-pci     -serial mon:stdio -serial null     -soundhw hda     -net nic     -net user,hostfwd=tcp::2222-:22

Automotive Grade Linux 13.90.0+snapshot-a5ea426b1da472fc8549459fff3c1b8c6e02f4b5 qemux86-64 ttyS0

qemux86-64 login: root
root@qemux86-64:~# 

In another terminal, in a directory of your choice, clone the vosk-server Git repository and copy the required files to the running image.

1
2
3
4
$ git clone https://github.com/alphacep/vosk-server.git
$ cd vosk-server/websocket
$ scp -P 2222 test.py root@localhost:~
$ scp -P 2222 test16k.wav root@localhost:~

In the Image:

  1
  2
  3
  4
  5
  6
  7
  8
  9
 10
 11
 12
 13
 14
 15
 16
 17
 18
 19
 20
 21
 22
 23
 24
 25
 26
 27
 28
 29
 30
 31
 32
 33
 34
 35
 36
 37
 38
 39
 40
 41
 42
 43
 44
 45
 46
 47
 48
 49
 50
 51
 52
 53
 54
 55
 56
 57
 58
 59
 60
 61
 62
 63
 64
 65
 66
 67
 68
 69
 70
 71
 72
 73
 74
 75
 76
 77
 78
 79
 80
 81
 82
 83
 84
 85
 86
 87
 88
 89
 90
 91
 92
 93
 94
 95
 96
 97
 98
 99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
$ ./vosk-websocket-python.py /usr/share/vosk/vosk-model-small-en-us-0.15/ &
[1] 427
root@qemux86-64:/usr/bin# LOG (VoskAPI:ReadDataFiles():model.cc:213) Decoding params beam=10 max-active=3000 lattice-beam=2
LOG (VoskAPI:ReadDataFiles():model.cc:216) Silence phones 1:2:3:4:5:6:7:8:9:10
LOG (VoskAPI:RemoveOrphanNodes():nnet-nnet.cc:948) Removed 0 orphan nodes.
LOG (VoskAPI:RemoveOrphanComponents():nnet-nnet.cc:847) Removing 0 orphan components.
LOG (VoskAPI:ReadDataFiles():model.cc:248) Loading i-vector extractor from /usr/share/vosk/vosk-model-small-en-us-0.15//ivector/final.ie
LOG (VoskAPI:ComputeDerivedVars():ivector-extractor.cc:183) Computing derived variables for iVector extractor
LOG (VoskAPI:ComputeDerivedVars():ivector-extractor.cc:204) Done.
LOG (VoskAPI:ReadDataFiles():model.cc:282) Loading HCL and G from /usr/share/vosk/vosk-model-small-en-us-0.15//graph/HCLr.fst /usr/share/vosk/vosk-model-small-en-us-0.15//graph/Gr.fst
LOG (VoskAPI:ReadDataFiles():model.cc:303) Loading winfo /usr/share/vosk/vosk-model-small-en-us-0.15//graph/phones/word_boundary.int
INFO:websockets.server:server listening on 0.0.0.0:2700

$ cd
$ ./test.py test16k.wav 
INFO:websockets.server:connection open
INFO:root:Connection from ('127.0.0.1', 53596)
INFO:root:Config {'sample_rate': 16000}
{
  "partial" : ""
}
{
  "partial" : "one"
}
{
  "partial" : "one zero"
}
{
  "partial" : "one zero"
}
{
  "partial" : "one zero zero"
}
{
  "partial" : "one zero zero"
}
{
  "partial" : "one zero zero zero"
}
{
  "partial" : "one zero zero zero one"
}
{
  "partial" : "one zero zero zero one"
}

{
  "result" : [{
      "conf" : 1.000000,
      "end" : 1.110000,
      "start" : 0.840000,
      "word" : "one"
    }, {
      "conf" : 1.000000,
      "end" : 1.530000,
      "start" : 1.110000,
      "word" : "zero"
    }, {
      "conf" : 1.000000,
      "end" : 1.920000,
      "start" : 1.530000,
      "word" : "zero"
    }, {
      "conf" : 1.000000,
      "end" : 2.310000,
      "start" : 1.920000,
      "word" : "zero"
    }, {
      "conf" : 1.000000,
      "end" : 2.610000,
      "start" : 2.310000,
      "word" : "one"
    }],
  "text" : "one zero zero zero one"
}
{
  "partial" : ""
}
{
  "partial" : ""
}
{
  "partial" : "nah no"
}
{
  "partial" : "nah no to i know"
}
{
  "partial" : "nah no to i know"
}
{
  "result" : [{
      "conf" : 0.559711,
      "end" : 4.110000,
      "start" : 3.930000,
      "word" : "nah"
    }, {
      "conf" : 0.616727,
      "end" : 4.290000,
      "start" : 4.110000,
      "word" : "no"
    }, {
      "conf" : 0.694549,
      "end" : 4.560000,
      "start" : 4.290000,
      "word" : "to"
    }, {
      "conf" : 0.496237,
      "end" : 4.620000,
      "start" : 4.560000,
      "word" : "i"
    }, {
      "conf" : 0.785862,
      "end" : 4.980000,
      "start" : 4.620000,
      "word" : "know"
    }],
  "text" : "nah no to i know"
}
{
  "partial" : ""
}

{
  "partial" : "zero"
}
{
  "partial" : "zero one"
}
{
  "partial" : "zero one eight zero"
}
{
  "partial" : "zero one eight zero three"
}
{
  "partial" : "zero one eight zero three"
}
{
  "result" : [{
      "conf" : 1.000000,
      "end" : 6.690000,
      "start" : 6.240000,
      "word" : "zero"
    }, {
      "conf" : 1.000000,
      "end" : 6.900000,
      "start" : 6.690000,
      "word" : "one"
    }, {
      "conf" : 1.000000,
      "end" : 7.110000,
      "start" : 6.930000,
      "word" : "eight"
    }, {
      "conf" : 1.000000,
      "end" : 7.500000,
      "start" : 7.110000,
      "word" : "zero"
    }, {
      "conf" : 1.000000,
      "end" : 7.980000,
      "start" : 7.500000,
      "word" : "three"
    }],
  "text" : "zero one eight zero three"
}
INFO:websockets.server:connection closed

$ kill $!

$ poweroff -f

Changes submitted to AGL:

WIP and to-dos:

  • Working on a Qt-based demo application to be used with the Vosk WebSocket server

Helpful Resources:

ptest:
For issues / help:
Misc.