Deep Visual Semantic Embedding with Text Data Augmentation and Word Embedding Initialization

<table class="table-group" id="tab1"><tr><td><table class="table"><tr><td class="thead-hr" colspan="2"><hr/></td></tr><tr class="thead"><td class="align_left">Notation</td><td class="align_center">Description</td></tr><tr><td class="thead-hr" colspan="2"><hr/></td></tr><tr><td class="align_left"><i>l</i></td><td class="align_center">The length of the sentence</td></tr><tr><td class="align_left"><i>n</i></td><td class="align_center">Number of times the augmentation operation is performed</td></tr><tr><td class="align_left"><i>p</i></td><td class="align_center">The probability of removing each word in the sentence</td></tr><tr><td class="align_left"><span style="width: 7.51131pt;"><svg height="6.1673pt" id="M1" style="vertical-align:-0.2063904pt" version="1.1" viewbox="-0.0498162 -5.96091 7.51131 6.1673" width="7.51131pt" xmlns="http://www.w3.org/2000/svg" xmlns:xlink="http://www.w3.org/1999/xlink"><g transform="matrix(.013,0,0,-0.013,0,0)"><path d="M545 106L524 126C493 85 467 65 455 65C438 65 427 113 405 238C448 295 498 362 543 439L533 448L478 435C453 386 423 331 398 295H395C370 404 347 448 282 448C169 448 23 309 23 153C23 54 65 -12 128 -12C203 -12 283 70 339 155H341C360 29 380 -12 411 -12C444 -12 491 11 545 106ZM333 204C265 95 210 54 169 54C137 54 113 96 113 171C113 302 191 405 252 405C301 405 318 306 333 204Z"></path></g></svg></span></td><td class="align_center">The percentage of words to be changed in the sentence</td></tr><tr><td class="align_left"><span style="width: 31.2374pt;"><svg height="14.0004pt" id="M2" style="vertical-align:-5.3645pt" version="1.1" viewbox="-0.0498162 -8.6359 31.2374 14.0004" width="31.2374pt" xmlns="http://www.w3.org/2000/svg" xmlns:xlink="http://www.w3.org/1999/xlink"><g transform="matrix(.013,0,0,-0.013,0,0)"><path d="M541 160L512 170C485 116 462 88 439 67C411 41 376 35 318 35C272 35 238 37 224 48C207 61 205 85 212 121L290 533C305 611 308 615 391 622L397 650H139L133 622C217 615 221 612 206 533L126 118C111 41 103 34 23 26L17 0H474C489 31 528 124 
541 160Z"></path></g><g transform="matrix(.0091,0,0,-0.0091,8.294,3.132)"><path d="M298 36L289 62C276 55 253 45 228 45C202 45 169 60 169 141V397H276C289 405 292 426 282 437H169V574L155 576L90 509V437H45L17 408L21 397H90V107C90 28 125 -12 188 -12C198 -12 213 -8 230 1L298 36Z"></path></g><g transform="matrix(.0091,0,0,-0.0091,11.115,3.132)"><path d="M181 342V451C133 431 89 419 40 411V388C98 381 102 377 102 311V104C102 38 95 32 33 26V0H263V26C186 32 181 38 181 104V287C203 343 235 372 261 372C277 372 289 366 304 352C310 346 318 345 330 350C349 359 362 379 362 399C362 422 338 449 304 449C256 449 213 393 183 342H181Z"></path></g><g transform="matrix(.0091,0,0,-0.0091,14.546,3.132)"><path d="M135 536C164 536 186 560 186 587C186 617 164 639 136 639C109 639 85 617 85 587C85 560 109 536 135 536ZM252 0V26C188 32 181 38 181 106V451C138 433 90 420 39 412V388C99 379 102 374 102 312V106C102 38 95 32 32 26V0H252Z"></path></g><g transform="matrix(.0091,0,0,-0.0091,16.875,3.132)"><path d="M169 380V459C122 440 66 423 24 416V392C86 384 90 382 90 317V-135C90 -201 81 -207 17 -213V-240H253V-213C176 -207 169 -201 169 -125V6C182 -1 208 -11 238 -12C368 12 487 109 487 260C487 358 421 449 310 449C298 449 279 444 261 433L169 380ZM169 346C196 367 237 389 269 389C341 389 403 329 403 221C403 109 347 37 263 37C228 37 191 53 169 76V346Z"></path></g><g transform="matrix(.0091,0,0,-0.0091,21.58,3.132)"><path d="M238 0V26C174 32 166 38 166 104V712C132 700 70 683 18 677V653C81 647 87 645 87 577V104C87 38 78 32 15 26V0H238Z"></path></g><g transform="matrix(.0091,0,0,-0.0091,23.882,3.132)"><path d="M380 106C343 72 306 56 265 56C195 56 116 112 115 248C235 252 361 262 377 265C396 269 400 277 400 297C400 374 333 449 250 449H249C198 449 144 421 103 376S37 269 37 201C37 88 109 -12 232 -12C263 -12 332 6 395 84L380 106ZM225 412C281 412 315 364 314 312C314 297 308 292 290 292C232 290 176 289 120 289C135 370 180 412 225 412Z"></path></g><g transform="matrix(.0091,0,0,-0.0091,27.749,3.132)"><path d="M298 36L289 
62C276 55 253 45 228 45C202 45 169 60 169 141V397H276C289 405 292 426 282 437H169V574L155 576L90 509V437H45L17 408L21 397H90V107C90 28 125 -12 188 -12C198 -12 213 -8 230 1L298 36Z"></path></g></svg></span></td><td class="align_center">The triplet loss</td></tr><tr><td class="align_left"><span style="width: 32.0869ptpx;"><svg height="11.9087pt" id="M3" style="vertical-align:-3.2728pt" version="1.1" viewbox="-0.0498162 -8.6359 32.0869 11.9087" width="32.0869pt" xmlns="http://www.w3.org/2000/svg" xmlns:xlink="http://www.w3.org/1999/xlink"><g transform="matrix(.013,0,0,-0.013,0,0)"><path d="M541 160L512 170C485 116 462 88 439 67C411 41 376 35 318 35C272 35 238 37 224 48C207 61 205 85 212 121L290 533C305 611 308 615 391 622L397 650H139L133 622C217 615 221 612 206 533L126 118C111 41 103 34 23 26L17 0H474C489 31 528 124 541 160Z"></path></g><g transform="matrix(.0091,0,0,-0.0091,8.294,3.132)"><path d="M797 0V26C739 32 732 36 732 103V296C732 394 682 449 605 449C576 449 550 437 529 423C504 407 475 389 446 366C425 418 382 449 334 449C303 449 279 437 253 421C222 403 201 385 180 371V452C135 432 85 419 41 411V388C99 379 102 374 102 310V103C102 38 93 32 27 26V0H238V26C189 32 180 38 180 103V338C210 363 250 390 289 390C351 390 377 348 377 275V103C377 37 368 32 306 26V0H520V26C465 32 456 38 456 101V296C456 314 455 326 453 338C491 369 529 390 565 390C628 390 653 345 653 274V107C653 36 642 32 583 26V0H797Z"></path></g><g transform="matrix(.0091,0,0,-0.0091,15.71,3.132)"><path d="M257 449C165 449 37 374 37 209C37 98 119 -12 256 -12C355 -12 473 65 473 226C473 349 381 449 257 449ZM244 416C333 416 380 320 380 204C380 67 329 21 267 21C184 21 130 115 130 241C130 354 184 416 244 416Z"></path></g><g transform="matrix(.0091,0,0,-0.0091,20.434,3.132)"><path d="M517 51L485 54C448 58 441 63 441 115V712C404 700 337 684 285 678V653C357 648 362 645 362 580V437C339 446 309 449 295 449C159 449 38 340 38 201C38 61 143 -12 223 -12C234 -12 261 -6 301 17L362 53V-12C420 9 495 22 517 26V51ZM362 85C338 67 
301 51 266 51C201 51 128 109 128 228C128 373 212 411 259 411C296 411 338 395 362 360V85Z"></path></g><g transform="matrix(.0091,0,0,-0.0091,25.238,3.132)"><path d="M380 106C343 72 306 56 265 56C195 56 116 112 115 248C235 252 361 262 377 265C396 269 400 277 400 297C400 374 333 449 250 449H249C198 449 144 421 103 376S37 269 37 201C37 88 109 -12 232 -12C263 -12 332 6 395 84L380 106ZM225 412C281 412 315 364 314 312C314 297 308 292 290 292C232 290 176 289 120 289C135 370 180 412 225 412Z"></path></g><g transform="matrix(.0091,0,0,-0.0091,29.069,3.132)"><path d="M238 0V26C174 32 166 38 166 104V712C132 700 70 683 18 677V653C81 647 87 645 87 577V104C87 38 78 32 15 26V0H238Z"></path></g></svg></span></td><td class="align_center">The loss of proposed model</td></tr><tr><td class="align_left"><span style="width: 17.5702ptpx;"><svg height="14.0004pt" id="M4" style="vertical-align:-5.3645pt" version="1.1" viewbox="-0.0498162 -8.6359 17.5702 14.0004" width="17.5702pt" xmlns="http://www.w3.org/2000/svg" xmlns:xlink="http://www.w3.org/1999/xlink"><g transform="matrix(.013,0,0,-0.013,0,0)"><path d="M449 634C442 637 425 643 405 650C376 660 341 666 307 666C181 666 98 590 98 485C98 400 170 343 215 310L246 288C307 243 343 204 343 147C343 67 291 18 219 18C104 18 61 124 51 202L23 199C28 124 27 71 27 47C47 22 122 -16 204 -16C324 -16 428 60 428 174C428 256 379 309 307 360L276 382C223 419 179 455 179 516C179 576 221 632 293 632C379 632 410 564 418 487L448 490C446 536 446 592 449 634Z"></path></g><g transform="matrix(.0091,0,0,-0.0091,6.071,3.132)"><path d="M433 39L423 65C413 59 399 54 387 54C370 54 352 69 352 114V299C352 352 342 392 307 422C285 440 255 449 225 449C168 437 102 399 75 379C56 365 44 353 44 339C44 315 69 296 87 296C101 296 111 303 116 319C124 349 133 371 145 385C156 397 171 404 190 404C241 404 275 364 275 291V274C253 256 180 229 120 209C65 190 39 159 39 110C39 47 88 -12 159 -12C189 -12 237 25 277 52C282 35 288 21 301 8C312 -3 333 -12 348 -12L433 39ZM275 84C256 65 221 48 195 
48C164 48 124 73 124 124C124 161 146 180 185 198C206 208 254 229 275 240V84Z"></path></g><g transform="matrix(.0091,0,0,-0.0091,10.066,3.132)"><path d="M95 130C70 130 46 113 46 88C46 72 54 64 59 64C93 55 121 33 121 -3C121 -41 93 -68 44 -88L55 -117C117 -98 186 -56 186 22C186 91 131 130 95 130Z"></path></g><g transform="matrix(.0091,0,0,-0.0091,12.141,3.132)"><path d="M169 380V459C122 440 66 423 24 416V392C86 384 90 382 90 317V-135C90 -201 81 -207 17 -213V-240H253V-213C176 -207 169 -201 169 -125V6C182 -1 208 -11 238 -12C368 12 487 109 487 260C487 358 421 449 310 449C298 449 279 444 261 433L169 380ZM169 346C196 367 237 389 269 389C341 389 403 329 403 221C403 109 347 37 263 37C228 37 191 53 169 76V346Z"></path></g></svg></span></td><td class="align_center">The similarity of anchor <i>x</i><sub>a</sub> and positive input <i>x</i><sub>p</sub></td></tr><tr><td class="align_left"><i>x</i><sub>a</sub></td><td class="align_center">Anchor input</td></tr><tr><td class="align_left"><i>x</i><sub>p</sub></td><td class="align_center">Positive input</td></tr><tr><td class="align_left"><i>x</i><sub>n</sub></td><td class="align_center">Negative input</td></tr><tr><td class="align_left"><span style="width: 7.30254pt;"><svg height="9.49473pt" id="M5" style="vertical-align:-0.2063999pt" version="1.1" viewbox="-0.0498162 -9.28833 7.30254 9.49473" width="7.30254pt" xmlns="http://www.w3.org/2000/svg" xmlns:xlink="http://www.w3.org/1999/xlink"><g transform="matrix(.013,0,0,-0.013,0,0)"><path d="M529 97L508 118C475 75 449 58 438 58C428 58 421 66 415 104C393 234 374 403 364 496C345 670 307 712 254 712C220 712 174 691 153 669L161 645C176 653 194 658 206 658C237 658 261 640 278 562C287 522 290 483 293 434C223 269 110 105 23 9L32 -12C59 -6 85 0 108 7C152 64 251 252 300 366C307 297 315 221 337 82C346 24 363 -12 393 -12C425 -12 475 13 529 97Z"></path></g></svg></span></td><td class="align_center">The margin that keeps the negative pairs away from each other</td></tr><tr><td 
class="align_left"><span style="width: 13.9531ptpx;"><svg height="12.8907pt" id="M6" style="vertical-align:-4.254801pt" version="1.1" viewbox="-0.0498162 -8.6359 13.9531 12.8907" width="13.9531pt" xmlns="http://www.w3.org/2000/svg" xmlns:xlink="http://www.w3.org/1999/xlink"><g transform="matrix(.013,0,0,-0.013,0,0)"><path d="M449 634C442 637 425 643 405 650C376 660 341 666 307 666C181 666 98 590 98 485C98 400 170 343 215 310L246 288C307 243 343 204 343 147C343 67 291 18 219 18C104 18 61 124 51 202L23 199C28 124 27 71 27 47C47 22 122 -16 204 -16C324 -16 428 60 428 174C428 256 379 309 307 360L276 382C223 419 179 455 179 516C179 576 221 632 293 632C379 632 410 564 418 487L448 490C446 536 446 592 449 634Z"></path></g><g transform="matrix(.0091,0,0,-0.0091,6.071,3.132)"><path d="M135 536C164 536 186 560 186 587C186 617 164 639 136 639C109 639 85 617 85 587C85 560 109 536 135 536ZM252 0V26C188 32 181 38 181 106V451C138 433 90 420 39 412V388C99 379 102 374 102 312V106C102 38 95 32 32 26V0H252Z"></path></g><g transform="matrix(.0091,0,0,-0.0091,8.455,3.132)"><path d="M95 130C70 130 46 113 46 88C46 72 54 64 59 64C93 55 121 33 121 -3C121 -41 93 -68 44 -88L55 -117C117 -98 186 -56 186 22C186 91 131 130 95 130Z"></path></g><g transform="matrix(.0091,0,0,-0.0091,10.53,3.132)"><path d="M298 36L289 62C276 55 253 45 228 45C202 45 169 60 169 141V397H276C289 405 292 426 282 437H169V574L155 576L90 509V437H45L17 408L21 397H90V107C90 28 125 -12 188 -12C198 -12 213 -8 230 1L298 36Z"></path></g></svg></span></td><td class="align_center">The similarity of image <i>i</i> and text <i>t</i></td></tr><tr><td class="align_left"><i>i</i></td><td class="align_center">Paired image</td></tr><tr><td class="align_left"><i>t</i></td><td class="align_center">Paired text</td></tr><tr><td class="align_left"><span style="width: 3.66193ptpx;"><svg height="12.1306pt" id="M7" style="vertical-align:-0.2063999pt" version="1.1" viewbox="-0.0498162 -11.9242 3.66193 12.1306" width="3.66193pt" 
xmlns="http://www.w3.org/2000/svg" xmlns:xlink="http://www.w3.org/1999/xlink"><g transform="matrix(.013,0,0,-0.013,-.437,-2.897)"><path d="M470 557C393 591 314 624 246 666H224C156 624 76 591 0 557L12 532C86 550 163 573 235 596C309 572 384 550 458 532L470 557Z"></path></g><g transform="matrix(.013,0,0,-0.013,0,0)"><path d="M244 607C244 633 228 655 200 655C166 655 146 618 146 594C146 564 166 546 191 546C221 546 244 574 244 607ZM222 91L209 114C184 94 148 66 133 66C127 66 124 73 130 96L201 370C213 416 211 448 191 448C162 448 88 407 29 352L42 328C73 354 104 371 114 371C120 371 119 365 115 345L53 92C32 5 45 -12 68 -12C103 -12 186 50 222 91Z"></path></g></svg></span></td><td class="align_center">Not paired image</td></tr><tr><td class="align_left"><span style="width: 4.54925ptpx;"><svg height="11.3085pt" id="M8" style="vertical-align:-0.2063999pt" version="1.1" viewbox="-0.0498162 -11.1021 4.54925 11.3085" width="4.54925pt" xmlns="http://www.w3.org/2000/svg" xmlns:xlink="http://www.w3.org/1999/xlink"><g transform="matrix(.013,0,0,-0.013,-.515,-2.075)"><path d="M470 557C393 591 314 624 246 666H224C156 624 76 591 0 557L12 532C86 550 163 573 235 596C309 572 384 550 458 532L470 557Z"></path></g><g transform="matrix(.013,0,0,-0.013,0,0)"><path d="M324 430H196L233 583L223 592L145 529L120 430H54L29 396L31 388H111L56 126C33 15 54 -12 77 -12C137 -12 214 57 250 95L233 119C208 92 155 59 138 59C126 59 120 70 131 125L186 390L298 394L324 430Z"></path></g></svg></span></td><td class="align_center">Not paired text</td></tr><tr class="table-tr"><td colspan="2"><hr class="tbody-hr"/></td></tr></table></td></tr></table>

Mathematical Problems in Engineering

tab1

Table 1

Table 1: Deep Visual Semantic Embedding with Text Data Augmentation and Word Embedding Initialization