A Symmetric Fusion Learning Model for Detecting Visual Relations and Scene Parsing

<table class="table-group" id="tab3"><tr><td><table class="table"><tr><td class="thead-hr" colspan="6"><hr/></td></tr><tr class="thead"><td class="align_left" rowspan="2"><i>K</i></td><td class="align_center" rowspan="2">Method</td><td class="align_center" colspan="2">Relation detection</td><td class="align_center" colspan="2">Phrase detection</td></tr><tr class="thead"><td class="align_center">R@50</td><td class="align_center">R@100</td><td class="align_center">R@50</td><td class="align_center">R@100</td></tr><tr><td class="thead-hr" colspan="6"><hr/></td></tr><tr><td class="align_left" rowspan="3">1</td><td class="align_center">Baseline</td><td class="align_center">25.39</td><td class="align_center">29.67</td><td class="align_center">31.50</td><td class="align_center">37.00</td></tr><tr><td class="align_center">Baseline + <svg height="12.7178pt" id="M117" style="vertical-align:-3.42947pt" version="1.1" viewbox="-0.0498162 -9.28833 11.7913 12.7178" width="11.7913pt" xmlns="http://www.w3.org/2000/svg" xmlns:xlink="http://www.w3.org/1999/xlink"><g transform="matrix(.013,0,0,-0.013,0,0)"><path d="M619 670C619 686 593 712 555 712S459 686 410 634S335 504 320 430H250L219 400L222 388H312L258 73C223 -133 201 -166 187 -180C175 -191 158 -199 140 -199C123 -199 88 -188 74 -172C68 -166 63 -164 54 -171C38 -185 23 -201 23 -215C23 -236 60 -261 93 -261C122 -261 161 -247 207 -200C268 -138 300 -71 337 94C365 220 376 277 394 387L501 399L521 430H401C432 623 464 665 501 665C524 665 544 651 567 627C577 617 583 618 592 625C601 631 619 651 619 670Z"></path></g><g transform="matrix(.013,0,0,-0.013,8.156,0)"><path d="M238 681C243 705 239 712 230 712C217 712 156 682 75 674L70 648H105C148 648 153 641 144 598L39 110C18 11 35 -12 55 -12C90 -12 166 36 221 103L205 125C174 93 130 65 118 65C112 65 108 68 114 96L238 681Z"></path></g></svg></td><td class="align_center">25.62</td><td class="align_center">29.92</td><td class="align_center">31.45</td><td class="align_center">37.21</td></tr><tr><td 
class="align_center">Baseline + <svg height="12.7178pt" id="M118" style="vertical-align:-3.42947pt" version="1.1" viewbox="-0.0498162 -9.28833 11.7913 12.7178" width="11.7913pt" xmlns="http://www.w3.org/2000/svg" xmlns:xlink="http://www.w3.org/1999/xlink"><g transform="matrix(.013,0,0,-0.013,0,0)"><path d="M619 670C619 686 593 712 555 712S459 686 410 634S335 504 320 430H250L219 400L222 388H312L258 73C223 -133 201 -166 187 -180C175 -191 158 -199 140 -199C123 -199 88 -188 74 -172C68 -166 63 -164 54 -171C38 -185 23 -201 23 -215C23 -236 60 -261 93 -261C122 -261 161 -247 207 -200C268 -138 300 -71 337 94C365 220 376 277 394 387L501 399L521 430H401C432 623 464 665 501 665C524 665 544 651 567 627C577 617 583 618 592 625C601 631 619 651 619 670Z"></path></g><g transform="matrix(.013,0,0,-0.013,8.156,0)"><path d="M238 681C243 705 239 712 230 712C217 712 156 682 75 674L70 648H105C148 648 153 641 144 598L39 110C18 11 35 -12 55 -12C90 -12 166 36 221 103L205 125C174 93 130 65 118 65C112 65 108 68 114 96L238 681Z"></path></g></svg> + <svg height="9.49473pt" id="M119" style="vertical-align:-0.2063999pt" version="1.1" viewbox="-0.0498162 -9.28833 8.63347 9.49473" width="8.63347pt" xmlns="http://www.w3.org/2000/svg" xmlns:xlink="http://www.w3.org/1999/xlink"><g transform="matrix(.013,0,0,-0.013,0,0)"><path d="M352 391C352 416 319 448 267 448C236 448 173 423 147 400C107 364 96 332 96 304C96 248 143 210 193 181C241 153 258 124 258 100C258 72 232 38 184 38C151 38 107 66 81 108C77 114 64 116 55 111C34 99 23 84 23 65C23 29 81 -12 134 -12C220 -12 325 61 325 141C325 184 297 215 234 256C194 282 161 309 161 346C161 380 188 401 217 401C255 401 279 380 301 353C308 344 313 341 325 347C341 355 352 371 352 391Z"></path></g><g transform="matrix(.013,0,0,-0.013,5.006,0)"><path d="M238 681C243 705 239 712 230 712C217 712 156 682 75 674L70 648H105C148 648 153 641 144 598L39 110C18 11 35 -12 55 -12C90 -12 166 36 221 103L205 125C174 93 130 65 118 65C112 65 108 68 114 96L238 
681Z"></path></g></svg></td><td class="align_center">26.01</td><td class="align_center">29.90</td><td class="align_center">32.02</td><td class="align_center">37.31</td></tr><tr><td class="align_left" colspan="6"><hr/></td></tr><tr><td class="align_left" rowspan="3">70</td><td class="align_center">Baseline</td><td class="align_center">27.38</td><td class="align_center">34.33</td><td class="align_center">33.56</td><td class="align_center">41.90</td></tr><tr><td class="align_center">Baseline + <svg height="12.7178pt" id="M120" style="vertical-align:-3.42947pt" version="1.1" viewbox="-0.0498162 -9.28833 11.7913 12.7178" width="11.7913pt" xmlns="http://www.w3.org/2000/svg" xmlns:xlink="http://www.w3.org/1999/xlink"><g transform="matrix(.013,0,0,-0.013,0,0)"><path d="M619 670C619 686 593 712 555 712S459 686 410 634S335 504 320 430H250L219 400L222 388H312L258 73C223 -133 201 -166 187 -180C175 -191 158 -199 140 -199C123 -199 88 -188 74 -172C68 -166 63 -164 54 -171C38 -185 23 -201 23 -215C23 -236 60 -261 93 -261C122 -261 161 -247 207 -200C268 -138 300 -71 337 94C365 220 376 277 394 387L501 399L521 430H401C432 623 464 665 501 665C524 665 544 651 567 627C577 617 583 618 592 625C601 631 619 651 619 670Z"></path></g><g transform="matrix(.013,0,0,-0.013,8.156,0)"><path d="M238 681C243 705 239 712 230 712C217 712 156 682 75 674L70 648H105C148 648 153 641 144 598L39 110C18 11 35 -12 55 -12C90 -12 166 36 221 103L205 125C174 93 130 65 118 65C112 65 108 68 114 96L238 681Z"></path></g></svg></td><td class="align_center">27.38</td><td class="align_center">34.81</td><td class="align_center">33.91</td><td class="align_center">42.46</td></tr><tr><td class="align_center">Baseline + <svg height="12.7178pt" id="M121" style="vertical-align:-3.42947pt" version="1.1" viewbox="-0.0498162 -9.28833 11.7913 12.7178" width="11.7913pt" xmlns="http://www.w3.org/2000/svg" xmlns:xlink="http://www.w3.org/1999/xlink"><g transform="matrix(.013,0,0,-0.013,0,0)"><path d="M619 670C619 686 593 712 555 712S459 
686 410 634S335 504 320 430H250L219 400L222 388H312L258 73C223 -133 201 -166 187 -180C175 -191 158 -199 140 -199C123 -199 88 -188 74 -172C68 -166 63 -164 54 -171C38 -185 23 -201 23 -215C23 -236 60 -261 93 -261C122 -261 161 -247 207 -200C268 -138 300 -71 337 94C365 220 376 277 394 387L501 399L521 430H401C432 623 464 665 501 665C524 665 544 651 567 627C577 617 583 618 592 625C601 631 619 651 619 670Z"></path></g><g transform="matrix(.013,0,0,-0.013,8.156,0)"><path d="M238 681C243 705 239 712 230 712C217 712 156 682 75 674L70 648H105C148 648 153 641 144 598L39 110C18 11 35 -12 55 -12C90 -12 166 36 221 103L205 125C174 93 130 65 118 65C112 65 108 68 114 96L238 681Z"></path></g></svg> + <svg height="9.49473pt" id="M122" style="vertical-align:-0.2063999pt" version="1.1" viewbox="-0.0498162 -9.28833 8.63347 9.49473" width="8.63347pt" xmlns="http://www.w3.org/2000/svg" xmlns:xlink="http://www.w3.org/1999/xlink"><g transform="matrix(.013,0,0,-0.013,0,0)"><path d="M352 391C352 416 319 448 267 448C236 448 173 423 147 400C107 364 96 332 96 304C96 248 143 210 193 181C241 153 258 124 258 100C258 72 232 38 184 38C151 38 107 66 81 108C77 114 64 116 55 111C34 99 23 84 23 65C23 29 81 -12 134 -12C220 -12 325 61 325 141C325 184 297 215 234 256C194 282 161 309 161 346C161 380 188 401 217 401C255 401 279 380 301 353C308 344 313 341 325 347C341 355 352 371 352 391Z"></path></g><g transform="matrix(.013,0,0,-0.013,5.006,0)"><path d="M238 681C243 705 239 712 230 712C217 712 156 682 75 674L70 648H105C148 648 153 641 144 598L39 110C18 11 35 -12 55 -12C90 -12 166 36 221 103L205 125C174 93 130 65 118 65C112 65 108 68 114 96L238 681Z"></path></g></svg></td><td class="align_center">28.63</td><td class="align_center">35.21</td><td class="align_center">34.88</td><td class="align_center">43.07</td></tr><tr class="table-tr"><td colspan="6"><hr class="tbody-hr"/></td></tr></table></td></tr></table>

<div>Ablation studies on the key components of our method. We report relation detection and phrase detection performance as R@n scores (R@50 and R@100).</div>

Scientific Programming

tab3

Table 3

Table 3: A Symmetric Fusion Learning Model for Detecting Visual Relations and Scene Parsing