Commit c720adb8 authored by Libretro-Admin's avatar Libretro-Admin
Browse files

use openmp for texture upscaling

parent 31c1cafc
......@@ -11,7 +11,7 @@ HAVE_GENERIC_JIT := 1
HAVE_GL3 := 0
FORCE_GLES := 0
STATIC_LINKING:= 0
HAVE_TEXUPSCALE := 0
HAVE_TEXUPSCALE := 1
ifeq ($(HAVE_OIT), 1)
TARGET_NAME := reicast_oit
......@@ -26,6 +26,7 @@ CC_AS = ${CC_PREFIX}as
MFLAGS :=
ASFLAGS :=
LDFLAGS :=
LDFLAGS_END :=
INCFLAGS :=
LIBS :=
CFLAGS :=
......@@ -536,6 +537,9 @@ endif
ifeq ($(HAVE_TEXUPSCALE), 1)
# Texture upscaling is parallelized with OpenMP, so -fopenmp has to be passed
# both when compiling and when linking.
CORE_DEFINES += -DHAVE_TEXUPSCALE
CXXFLAGS += -fopenmp
LDFLAGS += -fopenmp
# LDFLAGS_END is placed after $(OBJECTS) on the link line.
# NOTE(review): -lgmp looks unrelated to OpenMP -- confirm it is really needed.
# NOTE(review): -Bstatic is never switched back with -Wl,-Bdynamic, so it also
# applies to every library listed after LDFLAGS_END -- confirm this is intended.
LDFLAGS_END += -Wl,-Bstatic -lgmp -Wl,-Bstatic -lgomp
NEED_CXX11=1
NEED_PTHREAD=1
endif
......@@ -600,7 +604,7 @@ endif
CFLAGS += $(fpic)
CXXFLAGS += $(fpic)
LDFLAGS += $(fpic)
$(LDFLAGS_END) LDFLAGS += $(fpic)
OBJECTS := $(SOURCES_CXX:.cpp=.o) $(SOURCES_C:.c=.o) $(SOURCES_ASM:.S=.o)
......@@ -617,7 +621,7 @@ $(TARGET): $(OBJECTS)
ifeq ($(STATIC_LINKING), 1)
$(AR) rcs $@ $(OBJECTS)
else
$(LD) $(MFLAGS) $(fpic) $(SHARED) $(LDFLAGS) $(OBJECTS) $(GL_LIB) $(LIBS) -o $@
$(LD) $(MFLAGS) $(fpic) $(SHARED) $(LDFLAGS) $(OBJECTS) $(LDFLAGS_END) $(GL_LIB) $(LIBS) -o $@
endif
%.o: %.cpp
......
#ifdef HAVE_TEXUPSCALE
#include <list>
#include <functional>
#include <omp.h>
#endif
#include "TexCache.h"
#include "hw/pvr/pvr_regs.h"
#include "hw/pvr/ta.h"
#include "hw/mem/_vmem.h"
#ifdef HAVE_TEXUPSCALE
#include "deps/ctpl/ctpl_stl.h"
#include "deps/xbrz/xbrz.h"
#endif
......@@ -16,10 +17,6 @@ bool KillTex=false;
u32 palette16_ram[1024];
u32 palette32_ram[1024];
#ifdef HAVE_TEXUPSCALE
ctpl::thread_pool ThreadPool(1);
#endif
u32 detwiddle[2][8][1024];
//input : address in the yyyyyxxxxx format
//output : address in the xyxyxyxy format
......@@ -352,35 +349,18 @@ static void deposterizeV(u32* data, u32* out, int w, int h, int l, int u) {
// Splits the half-open range [start, end) into contiguous slices and calls
// func(from, to) once per slice from inside an OpenMP parallel region.
//
// func   callback invoked concurrently, once per thread of the team, with the
//        bounds of that thread's slice
// start  first index of the range
// end    one past the last index of the range
// width  unused by the OpenMP implementation; kept so callers do not change
//
// The parallel region ends with an implicit barrier, so every slice has been
// processed when this function returns.
void parallelize(const std::function<void(int,int)> &func, int start, int end, int width /* = 0 */)
{
	(void)width;

	// Ask for one thread fewer than the number of processors OpenMP reports,
	// bounded by the user setting.
	int tcount = omp_get_num_procs() - 1;
	tcount = min(tcount, (int)settings.pvr.MaxThreads);
	// Never request more threads than there are items: with a larger team the
	// per-thread chunk would be 0 and the last thread would do all the work.
	tcount = min(tcount, end - start);
	// num_threads() requires a positive value. MaxThreads may be 0 and the
	// range may be empty, so clamp last.
	if (tcount < 1)
		tcount = 1;

#pragma omp parallel num_threads(tcount)
	{
		// The runtime may deliver fewer threads than requested, so the slices
		// are derived from the actual team size.
		const int team_size = omp_get_num_threads();
		const int me = omp_get_thread_num();
		const int chunk = (end - start) / team_size;
		const int from = start + chunk * me;
		// The last thread also takes the remainder of the integer division.
		const int to = (me + 1 == team_size) ? end : from + chunk;
		func(from, to);
	}
}
......@@ -406,6 +386,5 @@ void UpscalexBRZ(int factor, u32* source, u32* dest, int width, int height, bool
// No-op: this commit removes the ThreadPool global (see the "-16,10 +17,6"
// hunk above), so there is nothing left to stop. The function itself is kept.
// NOTE(review): presumably it stays for callers elsewhere -- confirm before
// removing it.
void shutdown_thread_pool()
{
}
#endif
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment