if not share_query_proj:
# give each query vector its own projection matrix (one-to-one)
self.query = GroupLinearLayer(
hidden_size, kdim * num_heads, num_hidden)
else:
# share_query_proj is set: all query vectors share a single projection matrix
self.query = SharedGroupLinearLayer(
hidden_size, kdim * num_heads, num_hidden)
# all queries share the same projection *proj*
# NOTE(review): SharedGroupLinearLayer is not what's intended.