Results 1 to 9 of 9

Thread: Parallel steps issue in a job.

  1. #1
    Join Date
    Dec 2011
    Posts
    8

    Default Parallel steps issue in a job.

    Hi,

    I'm trying to create a simple Spring Batch project where I have only one job with 3 steps. The idea is to run these 3 steps in parallel. I have seen a lot of examples and read about parallel steps in Spring Batch in various books; however, the result of the execution is not as expected: at runtime I am monitoring the 3 created threads and they are not running concurrently. I would like to understand why. If somebody could point out a potential solution, that would be great, or maybe my XML job configuration is not the appropriate one.

    The scenario is simple: Each step reader reads from a different file, passes through a processor (that does nothing in this case) and writes the content into a database table using JDBC.

    Here is my launch-context.xml:

    Code:
    <!-- Job operator wired to the launcher, explorer, repository and registry beans defined in this context. -->
    <bean id="jobOperator" class="org.springframework.batch.core.launch.support.SimpleJobOperator">
    	<property name="jobLauncher" ref="jobLauncher" />
    	<property name="jobExplorer" ref="jobExplorer" />
    	<property name="jobRepository" ref="jobRepository" />
    	<property name="jobRegistry" ref="jobRegistry" />
    </bean>
    
    	<!-- Job explorer factory bean backed by the dataSource bean; injected into jobOperator. -->
    	<bean id="jobExplorer" class="org.springframework.batch.core.explore.support.JobExplorerFactoryBean" p:dataSource-ref="dataSource" />
    
    	<!-- Job registry referenced by jobOperator and by the JobRegistryBeanPostProcessor. -->
    	<bean id="jobRegistry" class="org.springframework.batch.core.configuration.support.MapJobRegistry" />
    
    	<!-- Anonymous post-processor that is handed the jobRegistry bean.
    	     NOTE(review): presumably this is what registers job beans with the registry; confirm against the Spring Batch docs. -->
    	<bean	class="org.springframework.batch.core.configuration.support.JobRegistryBeanPostProcessor">
    		<property name="jobRegistry" ref="jobRegistry" />
    	</bean>
    
    	<!-- Job launcher; only the job repository is configured on it (no task executor is set here). -->
    	<bean id="jobLauncher" class="org.springframework.batch.core.launch.support.SimpleJobLauncher"
    		p:jobRepository-ref="jobRepository" />
    
    	<!-- Job repository factory bean; uses the same dataSource and transactionManager beans as the steps. -->
    	<bean id="jobRepository" class="org.springframework.batch.core.repository.support.JobRepositoryFactoryBean"
    		p:dataSource-ref="dataSource" p:transactionManager-ref="transactionManager" />
    
    	<!--
    	  Commons DBCP data source; driver, URL and credentials are resolved from the
    	  batch.jdbc.* placeholders. destroy-method="close" makes the container call
    	  close() on the data source when the application context shuts down, so its
    	  pooled connections are released instead of being left open (e.g. on redeploy).
    	-->
    	<bean id="dataSource" class="org.apache.commons.dbcp.BasicDataSource" destroy-method="close">
    		<property name="driverClassName" value="${batch.jdbc.driver}" />
    		<property name="url" value="${batch.jdbc.url}" />
    		<property name="username" value="${batch.jdbc.user}" />
    		<property name="password" value="${batch.jdbc.password}" />
    	</bean>
    
    	<!-- Local JDBC transaction manager bound to the dataSource bean; used by the steps and the job repository. -->
    	<bean id="transactionManager" class="org.springframework.jdbc.datasource.DataSourceTransactionManager"
    		p:dataSource-ref="dataSource" />
    	
    	<!-- Resolves the ${...} placeholders used in this context (e.g. batch.jdbc.*) from batch-default.properties on the classpath. -->
    	<bean id="placeholderProperties" class="org.springframework.beans.factory.config.PropertyPlaceholderConfigurer">
    		<property name="location" value="classpath:batch-default.properties" />
    		<!-- NOTE(review): presumably lets JVM system properties take precedence over the file; confirm in the Spring docs. -->
    		<property name="systemPropertiesModeName" value="SYSTEM_PROPERTIES_MODE_OVERRIDE" />
    		<!-- Placeholders this configurer cannot resolve are left in place rather than failing. -->
    		<property name="ignoreUnresolvablePlaceholders" value="true" />
    		<property name="order" value="1" />
    	</bean>

    and here is my job-context.xml :
    Code:
    <import resource="../override/app-context.xml"/>
    		
    	<!-- Task executor referenced by the split element of parallelJob. -->
    	<bean id="taskExecutor" class="org.springframework.core.task.SimpleAsyncTaskExecutor"  />
    
    	<!-- Chunk step: inputReader1 -> processor -> outputWriter, commit interval 1000. Used as parent of step1 in parallelJob. -->
    	<batch:step id="file1Step">
    		<batch:tasklet transaction-manager="transactionManager">
    			<batch:chunk
    				reader="inputReader1"
    				processor="processor"
    				writer="outputWriter"
    				commit-interval="1000" />
    		</batch:tasklet>
    	</batch:step>
    	
    	<!-- Chunk step: inputReader2 -> processor -> outputWriter, commit interval 1000. Used as parent of step2 in parallelJob. -->
    	<batch:step id="file2Step">
    		<batch:tasklet transaction-manager="transactionManager">
    			<batch:chunk
    				reader="inputReader2"
    				processor="processor"
    				writer="outputWriter"
    				commit-interval="1000" />
    		</batch:tasklet>
    	</batch:step>
    	
    	<!-- Chunk step: inputReader3 -> processor -> outputWriter, commit interval 1000. Used as parent of step3 in parallelJob. -->
    	<batch:step id="file3Step">
    		<batch:tasklet transaction-manager="transactionManager">
    			<batch:chunk
    				reader="inputReader3"
    				processor="processor"
    				writer="outputWriter"
    				commit-interval="1000" />
    		</batch:tasklet>
    	</batch:step>
    	
    	<!-- Step-scoped flat-file reader for partition0.csv; used by file1Step. -->
    	<bean id="inputReader1" class="org.springframework.batch.item.file.FlatFileItemReader" scope="step">
    		<property name="resource" value="partition0.csv" />
    		<property name="lineMapper">
    			<bean class="org.springframework.batch.item.file.mapping.DefaultLineMapper">
    				<!-- Splits each line on "," into the seven named fields below. -->
    				<property name="lineTokenizer">
    					<bean class="org.springframework.batch.item.file.transform.DelimitedLineTokenizer">
    						<property name="names" value="uci,companyName,companyAddrCountry,companyAddrCity,companyAddrStreet,companyAddrPostcode,creditRating" />
    						<property name="delimiter" value=","/>
    					</bean>
    				</property>
    				<!-- Project-specific mapper applied to the tokenized fields (class not shown in this thread). -->
    				<property name="fieldSetMapper">
    					<bean class="org.springframework.batch.admin.sample.reader.CompanySetMapper" />
    				</property>
    			</bean>
    		</property>
    	</bean>
    	
    	<!-- Step-scoped flat-file reader for partition1.csv; used by file2Step. Same line mapping as inputReader1. -->
    	<bean id="inputReader2" class="org.springframework.batch.item.file.FlatFileItemReader" scope="step">
    		<property name="resource" value="partition1.csv" />
    		<property name="lineMapper">
    			<bean class="org.springframework.batch.item.file.mapping.DefaultLineMapper">
    				<!-- Splits each line on "," into the seven named fields below. -->
    				<property name="lineTokenizer">
    					<bean class="org.springframework.batch.item.file.transform.DelimitedLineTokenizer">
    						<property name="names" value="uci,companyName,companyAddrCountry,companyAddrCity,companyAddrStreet,companyAddrPostcode,creditRating" />
    						<property name="delimiter" value=","/>
    					</bean>
    				</property>
    				<!-- Project-specific mapper applied to the tokenized fields (class not shown in this thread). -->
    				<property name="fieldSetMapper">
    					<bean class="org.springframework.batch.admin.sample.reader.CompanySetMapper" />
    				</property>
    			</bean>
    		</property>
    	</bean>
    	
    	<!-- Step-scoped flat-file reader for partition2.csv; used by file3Step. Same line mapping as inputReader1. -->
    	<bean id="inputReader3" class="org.springframework.batch.item.file.FlatFileItemReader" scope="step">
    		<property name="resource" value="partition2.csv" />
    		<property name="lineMapper">
    			<bean class="org.springframework.batch.item.file.mapping.DefaultLineMapper">
    				<!-- Splits each line on "," into the seven named fields below. -->
    				<property name="lineTokenizer">
    					<bean class="org.springframework.batch.item.file.transform.DelimitedLineTokenizer">
    						<property name="names" value="uci,companyName,companyAddrCountry,companyAddrCity,companyAddrStreet,companyAddrPostcode,creditRating" />
    						<property name="delimiter" value=","/>
    					</bean>
    				</property>
    				<!-- Project-specific mapper applied to the tokenized fields (class not shown in this thread). -->
    				<property name="fieldSetMapper">
    					<bean class="org.springframework.batch.admin.sample.reader.CompanySetMapper" />
    				</property>
    			</bean>
    		</property>
    	</bean>
    	
    	<!-- Step-scoped JDBC writer shared by all three steps; inserts into company_rating through the dataSource bean. -->
    	<bean id="outputWriter" class="org.springframework.batch.item.database.JdbcBatchItemWriter" scope="step" >
    		<property name="dataSource" ref="dataSource"/>
    		<!-- Seven positional parameters, one per inserted column; they are bound by preparedStatementSetter. -->
    		<property name="sql" value="insert into company_rating (uci, company_Name, company_Addr_Country, company_Addr_City, company_Addr_Street, company_Addr_Postcode, credit_Rating) values (?, ?, ?, ?, ?, ?,?)" />
    		<property name="itemPreparedStatementSetter" ref="preparedStatementSetter"/>
    	</bean>
    	<!-- Project-specific statement setter referenced by outputWriter (class not shown in this thread). -->
    	<bean id="preparedStatementSetter" class="org.springframework.batch.admin.sample.writer.CompanyPreparedStatementSetter" />
    	
    	<!-- Step-scoped item processor shared by all three steps; the poster states it does nothing in this scenario. -->
    	<bean id="processor" class="org.springframework.batch.admin.sample.processor.SampleItemProcessor" scope="step" />
    	
    	<!-- Job consisting of a single split with three flows; the split is given the taskExecutor bean. -->
    	<batch:job id="parallelJob">
    		<batch:split id="parallelProcessing"  task-executor="taskExecutor">
    			<!-- Each flow holds one step that inherits its definition from the matching fileNStep. -->
    			<batch:flow>
    				<batch:step id="step1" parent="file1Step" />
    			</batch:flow>
    			<batch:flow>
    				<batch:step id="step2" parent="file2Step" />
    			</batch:flow>
    			<batch:flow>
    				<batch:step id="step3" parent="file3Step" />
    			</batch:flow>
    		</batch:split>
    	</batch:job>
    Thank you, and looking forward to your answer.

  2. #2

    Default

    i almost completely recreated your usecase over at my github repository, see https://github.com/langmi/spring-bat...-flows-job.xml

    only relevant differences

    • HSQLDB in-memory database vs your (supposedly) production grade db2/oracle/etc.
    • no-op writer vs your jdbc writer
    • simple txt files vs your csv
    • (supposedly) smaller files - only 20 lines each
    • i have no
      Code:
      <import resource="../override/app-context.xml"/>
    • simple WinXP 32bit dualcore system (i can test on 16core/64bit if needed)


    i used netbeans profiling to take a look at the threads

    i have three threads from the taskexecutor

    • SimpleAsyncTaskExecutor-1
    • SimpleAsyncTaskExecutor-2
    • SimpleAsyncTaskExecutor-3


    and for each thread the profiler says

    0:03.032: Started
    0:03.422: Monitor
    0:03.641: Finished
    according to the timeline view, all 3 threads did run indeed in parallel

    because all my processor/writer implementations log, i have a log too and it further confirms the above data

    2011-12-30 10:49:13,990 DEBUG SimpleAsyncTaskExecutor-3 [SimpleItemWriter] - <8>
    2011-12-30 10:49:13,990 DEBUG SimpleAsyncTaskExecutor-3 [SimpleItemWriter] - <9>
    2011-12-30 10:49:13,990 DEBUG SimpleAsyncTaskExecutor-3 [SimpleItemWriter] - <10>
    2011-12-30 10:49:13,990 DEBUG SimpleAsyncTaskExecutor-2 [SimpleItemProcessor] - <process:6>
    2011-12-30 10:49:13,990 DEBUG SimpleAsyncTaskExecutor-2 [SimpleItemProcessor] - <process:7>
    2011-12-30 10:49:13,990 DEBUG SimpleAsyncTaskExecutor-2 [SimpleItemProcessor] - <process:8>
    2011-12-30 10:49:13,990 DEBUG SimpleAsyncTaskExecutor-2 [SimpleItemProcessor] - <process:9>
    2011-12-30 10:49:13,990 DEBUG SimpleAsyncTaskExecutor-2 [SimpleItemProcessor] - <process:10>
    2011-12-30 10:49:13,990 DEBUG SimpleAsyncTaskExecutor-2 [SimpleItemWriter] - <writing:>
    2011-12-30 10:49:13,990 DEBUG SimpleAsyncTaskExecutor-2 [SimpleItemWriter] - <6>
    2011-12-30 10:49:13,990 DEBUG SimpleAsyncTaskExecutor-2 [SimpleItemWriter] - <7>
    2011-12-30 10:49:13,990 DEBUG SimpleAsyncTaskExecutor-2 [SimpleItemWriter] - <8>
    2011-12-30 10:49:13,990 DEBUG SimpleAsyncTaskExecutor-2 [SimpleItemWriter] - <9>
    2011-12-30 10:49:13,990 DEBUG SimpleAsyncTaskExecutor-2 [SimpleItemWriter] - <10>
    2011-12-30 10:49:13,990 DEBUG SimpleAsyncTaskExecutor-1 [SimpleItemProcessor] - <process:2>
    2011-12-30 10:49:13,990 DEBUG SimpleAsyncTaskExecutor-1 [SimpleItemProcessor] - <process:3>

    soo

    because in runtime i am monitoring the 3 created threads and they are not running concurrently
    here are some questions/pointers to find the source of the problem

    • how do you monitor the created threads and how do you see that they are not running concurrently?
    • is your usecase IO bound? how large are the used files?
    • what runtime is used ?
    • did you check for bottlenecks? e.g. the org.springframework.batch.admin.sample.reader.CompanySetMapper or the jdbc writer



    ps: i tested it with a synchronized write() method too, no differences
    Last edited by michael.lange; Dec 30th, 2011 at 04:11 AM.

  3. #3
    Join Date
    Dec 2011
    Posts
    8

    Default

    Hi Michael,

    first i would like to thank you for your quick reply.

    So, to add more details about the project and resources..

    1) i use Windows 7 Enterprise SP1, x64, Intel Core2Duo;
    2) i am monitoring the created threads using VisualVM 1.3.3 and indeed there are 3 threads running in parallel, however, only one of them is active (processing) at a time, and this seems to be the problem;
    3) the files are 10MB each;
    4) i use the application in a JBoss 7 AS web container;
    5) I didn't check for bottlenecks, but initially i thought the problem was the writing to DB, then i switched the writing to 3 different text files, and the execution time was the same.

    I am attaching a small image here, so you can understand better what i mean by "running in parallel but executing one thread at a time" :
    Maybe i am missing something very small and important on all this configuration.

    threads.jpg

  4. #4

    Default

    well each thread uses the same BasicDataSource, without pooling i guess this is the bottleneck (waiting for next connection, executing sql)

    can you try it out with a commit-interval of 1 ? the waiting times should be much smaller


    (*)in my provided code example you can simply mimic the bottleneck by making the writer synchronized and adding a wait(1000); the timeline view will then look similar to yours

  5. #5
    Join Date
    Dec 2011
    Posts
    8

    Default

    i have tried it with a commit-interval of 1 and the waiting time seems to be much smaller however, the overall execution time is much bigger.

    So, about BasicDataSource, is there anything that i can change/add to make this example work completely in parallel without any thread wait time ?

    About pooling, in this example what should i change exactly ? (Don't have much experience with connection pooling).

    Thank you for your reply.

  6. #6

    Default

    take a look at c3p0 or even better use the jboss datasource capabilities

  7. #7
    Join Date
    Dec 2011
    Posts
    8

    Default

    Thanks a lot Michael.

  8. #8
    Join Date
    Dec 2011
    Posts
    8

    Default

    Hi again Michael,

    I've been experimenting with your suggestions for the past few days, configuring a JBoss datasource and forcing Spring Batch to use it. Still, I am not achieving the result that I want. I experimented with JtaTransactionManager and it seems that there is a bottleneck somewhere. Nothing has changed: the threads still run at the same time but only one of them is processing. I would really like to solve this problem; if you could suggest something else, or point out a missing configuration, I would be very grateful.

    I'm posting here the part of the application-context.xml that i changed to use JTA Transaction:
    Code:
            <!-- dataSource is now looked up from JNDI under java:/batch, the jndi-name of the JBoss datasource shown further below. -->
            <jee:jndi-lookup id="dataSource" jndi-name="java:/batch" lookup-on-startup="true" resource-ref="true" cache="true" />
    
            <!-- Arjuna transaction manager and user transaction, instantiated directly as Spring beans and injected into the JtaTransactionManager.
                 NOTE(review): these are created by Spring rather than obtained from the application server; confirm this is intended. -->
            <bean id="jbossTransactionManager" class="com.arjuna.ats.internal.jta.transaction.arjunacore.TransactionManagerImple" />
    	<bean id="jbossUserTransaction" class="com.arjuna.ats.internal.jta.transaction.arjunacore.UserTransactionImple" />
    
    	<!-- JTA transaction manager delegating to the two Arjuna beans; custom isolation levels are allowed. -->
    	<bean id="transactionManager" class="org.springframework.transaction.jta.JtaTransactionManager">
    		<property name="transactionManager" ref="jbossTransactionManager" />
    		<property name="userTransaction" ref="jbossUserTransaction" />
    		<property name="allowCustomIsolationLevels" value="true" />
    	</bean>
    and the Datasource subsystem from the standalone.xml on JBoss AS 7.0.2:
    Code:
                    <!-- JBoss datasource bound at java:/batch (the name the Spring jndi-lookup uses), JTA-enabled, using the "oracle" driver. -->
                    <datasource jndi-name="java:/batch" pool-name="batch-pool" enabled="true" jta="true" use-java-context="true" use-ccm="true">
                        <connection-url>
                            jdbc:oracle:thin:@srv:1521:sid
                        </connection-url>
                        <driver>
                            oracle
                        </driver>
                        <transaction-isolation>
                            TRANSACTION_READ_COMMITTED
                        </transaction-isolation>
                        <!-- Connection pool sized between 1 and 100 connections, with prefill enabled. -->
                        <pool>
                            <min-pool-size>
                                1
                            </min-pool-size>
                            <max-pool-size>
                                100
                            </max-pool-size>
                            <prefill>
                                true
                            </prefill>
                            <use-strict-min>
                                false
                            </use-strict-min>
                            <flush-strategy>
                                FailingConnectionOnly
                            </flush-strategy>
                        </pool>
                        <!-- NOTE(review): database credentials are stored here in plain text. -->
                        <security>
                            <user-name>
                                batch
                            </user-name>
                            <password>
                                solution
                            </password>
                        </security>
                        <!-- Prepared-statement cache size is set to 0. -->
                        <statement>
                            <prepared-statement-cache-size>
                                0
                            </prepared-statement-cache-size>
                            <share-prepared-statements/>
                        </statement>
                    </datasource>
                    <!-- Registers the "oracle" driver from the com.oracle module.
                         NOTE(review): only an xa-datasource-class is declared, while the datasource above is a non-XA <datasource> element; confirm this is intended. -->
                    <drivers>
                        <driver name="oracle" module="com.oracle">
                            <xa-datasource-class>
                                oracle.jdbc.xa.client.OracleXADataSource
                            </xa-datasource-class>
                        </driver>
                    </drivers>
    and the module.xml:
    Code:
    <!-- JBoss module definition for com.oracle: exposes ojdbc6.jar as its resource root and depends on the javax.api module. -->
    <module xmlns="urn:jboss:module:1.0" name="com.oracle">
      <resources>
        <resource-root path="ojdbc6.jar"/>
      </resources>
      <dependencies>
        <module name="javax.api"/>
      </dependencies>
    </module>

  9. #9
    Join Date
    Dec 2011
    Posts
    8

    Default

    solved. The bottleneck was the logging, writing to console.. very stupid.

Posting Permissions

  • You may not post new threads
  • You may not post replies
  • You may not post attachments
  • You may not edit your posts
  •