drm/radeon/kms: optimize CS state checking for r100->r500
The colorbuffer, zbuffer, and texture states are now checked only when
they have changed, instead of being re-checked every time. This improves
performance in apps which emit lots of draw packets and few state changes.
This drops performance in glxgears by 1% or so, but glxgears is not
a benchmark we care about.
The time spent in the kernel when running Torcs dropped from 33% to 23%
and the frame rate is higher, which is a good thing.
r600 might need something like this as well.
Signed-off-by: Marek Olšák <maraeo@gmail.com>
Signed-off-by: Dave Airlie <airlied@redhat.com>
diff --git a/drivers/gpu/drm/radeon/r100.c b/drivers/gpu/drm/radeon/r100.c
index 5f15820..fdf4bc6 100644
--- a/drivers/gpu/drm/radeon/r100.c
+++ b/drivers/gpu/drm/radeon/r100.c
@@ -1427,6 +1427,7 @@
}
track->zb.robj = reloc->robj;
track->zb.offset = idx_value;
+ track->zb_dirty = true;
ib[idx] = idx_value + ((u32)reloc->lobj.gpu_offset);
break;
case RADEON_RB3D_COLOROFFSET:
@@ -1439,6 +1440,7 @@
}
track->cb[0].robj = reloc->robj;
track->cb[0].offset = idx_value;
+ track->cb_dirty = true;
ib[idx] = idx_value + ((u32)reloc->lobj.gpu_offset);
break;
case RADEON_PP_TXOFFSET_0:
@@ -1454,6 +1456,7 @@
}
ib[idx] = idx_value + ((u32)reloc->lobj.gpu_offset);
track->textures[i].robj = reloc->robj;
+ track->tex_dirty = true;
break;
case RADEON_PP_CUBIC_OFFSET_T0_0:
case RADEON_PP_CUBIC_OFFSET_T0_1:
@@ -1471,6 +1474,7 @@
track->textures[0].cube_info[i].offset = idx_value;
ib[idx] = idx_value + ((u32)reloc->lobj.gpu_offset);
track->textures[0].cube_info[i].robj = reloc->robj;
+ track->tex_dirty = true;
break;
case RADEON_PP_CUBIC_OFFSET_T1_0:
case RADEON_PP_CUBIC_OFFSET_T1_1:
@@ -1488,6 +1492,7 @@
track->textures[1].cube_info[i].offset = idx_value;
ib[idx] = idx_value + ((u32)reloc->lobj.gpu_offset);
track->textures[1].cube_info[i].robj = reloc->robj;
+ track->tex_dirty = true;
break;
case RADEON_PP_CUBIC_OFFSET_T2_0:
case RADEON_PP_CUBIC_OFFSET_T2_1:
@@ -1505,9 +1510,12 @@
track->textures[2].cube_info[i].offset = idx_value;
ib[idx] = idx_value + ((u32)reloc->lobj.gpu_offset);
track->textures[2].cube_info[i].robj = reloc->robj;
+ track->tex_dirty = true;
break;
case RADEON_RE_WIDTH_HEIGHT:
track->maxy = ((idx_value >> 16) & 0x7FF);
+ track->cb_dirty = true;
+ track->zb_dirty = true;
break;
case RADEON_RB3D_COLORPITCH:
r = r100_cs_packet_next_reloc(p, &reloc);
@@ -1528,9 +1536,11 @@
ib[idx] = tmp;
track->cb[0].pitch = idx_value & RADEON_COLORPITCH_MASK;
+ track->cb_dirty = true;
break;
case RADEON_RB3D_DEPTHPITCH:
track->zb.pitch = idx_value & RADEON_DEPTHPITCH_MASK;
+ track->zb_dirty = true;
break;
case RADEON_RB3D_CNTL:
switch ((idx_value >> RADEON_RB3D_COLOR_FORMAT_SHIFT) & 0x1f) {
@@ -1555,6 +1565,8 @@
return -EINVAL;
}
track->z_enabled = !!(idx_value & RADEON_Z_ENABLE);
+ track->cb_dirty = true;
+ track->zb_dirty = true;
break;
case RADEON_RB3D_ZSTENCILCNTL:
switch (idx_value & 0xf) {
@@ -1572,6 +1584,7 @@
default:
break;
}
+ track->zb_dirty = true;
break;
case RADEON_RB3D_ZPASS_ADDR:
r = r100_cs_packet_next_reloc(p, &reloc);
@@ -1588,6 +1601,7 @@
uint32_t temp = idx_value >> 4;
for (i = 0; i < track->num_texture; i++)
track->textures[i].enabled = !!(temp & (1 << i));
+ track->tex_dirty = true;
}
break;
case RADEON_SE_VF_CNTL:
@@ -1602,12 +1616,14 @@
i = (reg - RADEON_PP_TEX_SIZE_0) / 8;
track->textures[i].width = (idx_value & RADEON_TEX_USIZE_MASK) + 1;
track->textures[i].height = ((idx_value & RADEON_TEX_VSIZE_MASK) >> RADEON_TEX_VSIZE_SHIFT) + 1;
+ track->tex_dirty = true;
break;
case RADEON_PP_TEX_PITCH_0:
case RADEON_PP_TEX_PITCH_1:
case RADEON_PP_TEX_PITCH_2:
i = (reg - RADEON_PP_TEX_PITCH_0) / 8;
track->textures[i].pitch = idx_value + 32;
+ track->tex_dirty = true;
break;
case RADEON_PP_TXFILTER_0:
case RADEON_PP_TXFILTER_1:
@@ -1621,6 +1637,7 @@
tmp = (idx_value >> 27) & 0x7;
if (tmp == 2 || tmp == 6)
track->textures[i].roundup_h = false;
+ track->tex_dirty = true;
break;
case RADEON_PP_TXFORMAT_0:
case RADEON_PP_TXFORMAT_1:
@@ -1673,6 +1690,7 @@
}
track->textures[i].cube_info[4].width = 1 << ((idx_value >> 16) & 0xf);
track->textures[i].cube_info[4].height = 1 << ((idx_value >> 20) & 0xf);
+ track->tex_dirty = true;
break;
case RADEON_PP_CUBIC_FACES_0:
case RADEON_PP_CUBIC_FACES_1:
@@ -1683,6 +1701,7 @@
track->textures[i].cube_info[face].width = 1 << ((tmp >> (face * 8)) & 0xf);
track->textures[i].cube_info[face].height = 1 << ((tmp >> ((face * 8) + 4)) & 0xf);
}
+ track->tex_dirty = true;
break;
default:
printk(KERN_ERR "Forbidden register 0x%04X in cs at %d\n",
@@ -3318,9 +3337,9 @@
unsigned long size;
unsigned prim_walk;
unsigned nverts;
- unsigned num_cb = track->num_cb;
+ unsigned num_cb = track->cb_dirty ? track->num_cb : 0;
- if (!track->zb_cb_clear && !track->color_channel_mask &&
+ if (num_cb && !track->zb_cb_clear && !track->color_channel_mask &&
!track->blend_read_enable)
num_cb = 0;
@@ -3341,7 +3360,9 @@
return -EINVAL;
}
}
- if (track->z_enabled) {
+ track->cb_dirty = false;
+
+ if (track->zb_dirty && track->z_enabled) {
if (track->zb.robj == NULL) {
DRM_ERROR("[drm] No buffer for z buffer !\n");
return -EINVAL;
@@ -3358,6 +3379,8 @@
return -EINVAL;
}
}
+ track->zb_dirty = false;
+
prim_walk = (track->vap_vf_cntl >> 4) & 0x3;
if (track->vap_vf_cntl & (1 << 14)) {
nverts = track->vap_alt_nverts;
@@ -3417,13 +3440,22 @@
prim_walk);
return -EINVAL;
}
- return r100_cs_track_texture_check(rdev, track);
+
+ if (track->tex_dirty) {
+ track->tex_dirty = false;
+ return r100_cs_track_texture_check(rdev, track);
+ }
+ return 0;
}
void r100_cs_track_clear(struct radeon_device *rdev, struct r100_cs_track *track)
{
unsigned i, face;
+ track->cb_dirty = true;
+ track->zb_dirty = true;
+ track->tex_dirty = true;
+
if (rdev->family < CHIP_R300) {
track->num_cb = 1;
if (rdev->family <= CHIP_RS200)
diff --git a/drivers/gpu/drm/radeon/r100_track.h b/drivers/gpu/drm/radeon/r100_track.h
index af65600..ee85c4a 100644
--- a/drivers/gpu/drm/radeon/r100_track.h
+++ b/drivers/gpu/drm/radeon/r100_track.h
@@ -52,14 +52,7 @@
unsigned compress_format;
};
-struct r100_cs_track_limits {
- unsigned num_cb;
- unsigned num_texture;
- unsigned max_levels;
-};
-
struct r100_cs_track {
- struct radeon_device *rdev;
unsigned num_cb;
unsigned num_texture;
unsigned maxy;
@@ -78,6 +71,10 @@
bool separate_cube;
bool zb_cb_clear;
bool blend_read_enable;
+
+ bool cb_dirty;
+ bool zb_dirty;
+ bool tex_dirty;
};
int r100_cs_track_check(struct radeon_device *rdev, struct r100_cs_track *track);
diff --git a/drivers/gpu/drm/radeon/r200.c b/drivers/gpu/drm/radeon/r200.c
index d2408c3..f240583 100644
--- a/drivers/gpu/drm/radeon/r200.c
+++ b/drivers/gpu/drm/radeon/r200.c
@@ -184,6 +184,7 @@
}
track->zb.robj = reloc->robj;
track->zb.offset = idx_value;
+ track->zb_dirty = true;
ib[idx] = idx_value + ((u32)reloc->lobj.gpu_offset);
break;
case RADEON_RB3D_COLOROFFSET:
@@ -196,6 +197,7 @@
}
track->cb[0].robj = reloc->robj;
track->cb[0].offset = idx_value;
+ track->cb_dirty = true;
ib[idx] = idx_value + ((u32)reloc->lobj.gpu_offset);
break;
case R200_PP_TXOFFSET_0:
@@ -214,6 +216,7 @@
}
ib[idx] = idx_value + ((u32)reloc->lobj.gpu_offset);
track->textures[i].robj = reloc->robj;
+ track->tex_dirty = true;
break;
case R200_PP_CUBIC_OFFSET_F1_0:
case R200_PP_CUBIC_OFFSET_F2_0:
@@ -257,9 +260,12 @@
track->textures[i].cube_info[face - 1].offset = idx_value;
ib[idx] = idx_value + ((u32)reloc->lobj.gpu_offset);
track->textures[i].cube_info[face - 1].robj = reloc->robj;
+ track->tex_dirty = true;
break;
case RADEON_RE_WIDTH_HEIGHT:
track->maxy = ((idx_value >> 16) & 0x7FF);
+ track->cb_dirty = true;
+ track->zb_dirty = true;
break;
case RADEON_RB3D_COLORPITCH:
r = r100_cs_packet_next_reloc(p, &reloc);
@@ -280,9 +286,11 @@
ib[idx] = tmp;
track->cb[0].pitch = idx_value & RADEON_COLORPITCH_MASK;
+ track->cb_dirty = true;
break;
case RADEON_RB3D_DEPTHPITCH:
track->zb.pitch = idx_value & RADEON_DEPTHPITCH_MASK;
+ track->zb_dirty = true;
break;
case RADEON_RB3D_CNTL:
switch ((idx_value >> RADEON_RB3D_COLOR_FORMAT_SHIFT) & 0x1f) {
@@ -312,6 +320,8 @@
}
track->z_enabled = !!(idx_value & RADEON_Z_ENABLE);
+ track->cb_dirty = true;
+ track->zb_dirty = true;
break;
case RADEON_RB3D_ZSTENCILCNTL:
switch (idx_value & 0xf) {
@@ -329,6 +339,7 @@
default:
break;
}
+ track->zb_dirty = true;
break;
case RADEON_RB3D_ZPASS_ADDR:
r = r100_cs_packet_next_reloc(p, &reloc);
@@ -345,6 +356,7 @@
uint32_t temp = idx_value >> 4;
for (i = 0; i < track->num_texture; i++)
track->textures[i].enabled = !!(temp & (1 << i));
+ track->tex_dirty = true;
}
break;
case RADEON_SE_VF_CNTL:
@@ -369,6 +381,7 @@
i = (reg - R200_PP_TXSIZE_0) / 32;
track->textures[i].width = (idx_value & RADEON_TEX_USIZE_MASK) + 1;
track->textures[i].height = ((idx_value & RADEON_TEX_VSIZE_MASK) >> RADEON_TEX_VSIZE_SHIFT) + 1;
+ track->tex_dirty = true;
break;
case R200_PP_TXPITCH_0:
case R200_PP_TXPITCH_1:
@@ -378,6 +391,7 @@
case R200_PP_TXPITCH_5:
i = (reg - R200_PP_TXPITCH_0) / 32;
track->textures[i].pitch = idx_value + 32;
+ track->tex_dirty = true;
break;
case R200_PP_TXFILTER_0:
case R200_PP_TXFILTER_1:
@@ -394,6 +408,7 @@
tmp = (idx_value >> 27) & 0x7;
if (tmp == 2 || tmp == 6)
track->textures[i].roundup_h = false;
+ track->tex_dirty = true;
break;
case R200_PP_TXMULTI_CTL_0:
case R200_PP_TXMULTI_CTL_1:
@@ -432,6 +447,7 @@
track->textures[i].tex_coord_type = 1;
break;
}
+ track->tex_dirty = true;
break;
case R200_PP_TXFORMAT_0:
case R200_PP_TXFORMAT_1:
@@ -488,6 +504,7 @@
}
track->textures[i].cube_info[4].width = 1 << ((idx_value >> 16) & 0xf);
track->textures[i].cube_info[4].height = 1 << ((idx_value >> 20) & 0xf);
+ track->tex_dirty = true;
break;
case R200_PP_CUBIC_FACES_0:
case R200_PP_CUBIC_FACES_1:
@@ -501,6 +518,7 @@
track->textures[i].cube_info[face].width = 1 << ((tmp >> (face * 8)) & 0xf);
track->textures[i].cube_info[face].height = 1 << ((tmp >> ((face * 8) + 4)) & 0xf);
}
+ track->tex_dirty = true;
break;
default:
printk(KERN_ERR "Forbidden register 0x%04X in cs at %d\n",
diff --git a/drivers/gpu/drm/radeon/r300.c b/drivers/gpu/drm/radeon/r300.c
index 55fe5ba..15f9464 100644
--- a/drivers/gpu/drm/radeon/r300.c
+++ b/drivers/gpu/drm/radeon/r300.c
@@ -667,6 +667,7 @@
}
track->cb[i].robj = reloc->robj;
track->cb[i].offset = idx_value;
+ track->cb_dirty = true;
ib[idx] = idx_value + ((u32)reloc->lobj.gpu_offset);
break;
case R300_ZB_DEPTHOFFSET:
@@ -679,6 +680,7 @@
}
track->zb.robj = reloc->robj;
track->zb.offset = idx_value;
+ track->zb_dirty = true;
ib[idx] = idx_value + ((u32)reloc->lobj.gpu_offset);
break;
case R300_TX_OFFSET_0:
@@ -717,6 +719,7 @@
tmp |= tile_flags;
ib[idx] = tmp;
track->textures[i].robj = reloc->robj;
+ track->tex_dirty = true;
break;
/* Tracked registers */
case 0x2084:
@@ -743,6 +746,8 @@
if (p->rdev->family < CHIP_RV515) {
track->maxy -= 1440;
}
+ track->cb_dirty = true;
+ track->zb_dirty = true;
break;
case 0x4E00:
/* RB3D_CCTL */
@@ -752,6 +757,7 @@
return -EINVAL;
}
track->num_cb = ((idx_value >> 5) & 0x3) + 1;
+ track->cb_dirty = true;
break;
case 0x4E38:
case 0x4E3C:
@@ -814,6 +820,7 @@
((idx_value >> 21) & 0xF));
return -EINVAL;
}
+ track->cb_dirty = true;
break;
case 0x4F00:
/* ZB_CNTL */
@@ -822,6 +829,7 @@
} else {
track->z_enabled = false;
}
+ track->zb_dirty = true;
break;
case 0x4F10:
/* ZB_FORMAT */
@@ -838,6 +846,7 @@
(idx_value & 0xF));
return -EINVAL;
}
+ track->zb_dirty = true;
break;
case 0x4F24:
/* ZB_DEPTHPITCH */
@@ -861,6 +870,7 @@
ib[idx] = tmp;
track->zb.pitch = idx_value & 0x3FFC;
+ track->zb_dirty = true;
break;
case 0x4104:
for (i = 0; i < 16; i++) {
@@ -869,6 +879,7 @@
enabled = !!(idx_value & (1 << i));
track->textures[i].enabled = enabled;
}
+ track->tex_dirty = true;
break;
case 0x44C0:
case 0x44C4:
@@ -951,8 +962,8 @@
DRM_ERROR("Invalid texture format %u\n",
(idx_value & 0x1F));
return -EINVAL;
- break;
}
+ track->tex_dirty = true;
break;
case 0x4400:
case 0x4404:
@@ -980,6 +991,7 @@
if (tmp == 2 || tmp == 4 || tmp == 6) {
track->textures[i].roundup_h = false;
}
+ track->tex_dirty = true;
break;
case 0x4500:
case 0x4504:
@@ -1017,6 +1029,7 @@
DRM_ERROR("Forbidden bit TXFORMAT_MSB\n");
return -EINVAL;
}
+ track->tex_dirty = true;
break;
case 0x4480:
case 0x4484:
@@ -1046,6 +1059,7 @@
track->textures[i].use_pitch = !!tmp;
tmp = (idx_value >> 22) & 0xF;
track->textures[i].txdepth = tmp;
+ track->tex_dirty = true;
break;
case R300_ZB_ZPASS_ADDR:
r = r100_cs_packet_next_reloc(p, &reloc);
@@ -1060,6 +1074,7 @@
case 0x4e0c:
/* RB3D_COLOR_CHANNEL_MASK */
track->color_channel_mask = idx_value;
+ track->cb_dirty = true;
break;
case 0x43a4:
/* SC_HYPERZ_EN */
@@ -1073,6 +1088,8 @@
case 0x4f1c:
/* ZB_BW_CNTL */
track->zb_cb_clear = !!(idx_value & (1 << 5));
+ track->cb_dirty = true;
+ track->zb_dirty = true;
if (p->rdev->hyperz_filp != p->filp) {
if (idx_value & (R300_HIZ_ENABLE |
R300_RD_COMP_ENABLE |
@@ -1084,6 +1101,7 @@
case 0x4e04:
/* RB3D_BLENDCNTL */
track->blend_read_enable = !!(idx_value & (1 << 2));
+ track->cb_dirty = true;
break;
case 0x4f28: /* ZB_DEPTHCLEARVALUE */
break;