-
Notifications
You must be signed in to change notification settings - Fork 2
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
Merge pull request #182 from tukcomCD2024/Dev-backend
Dev backend
- Loading branch information
Showing 12 changed files with 325 additions and 131 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
36 changes: 36 additions & 0 deletions
36
backend/core/src/main/java/com/rollthedice/backend/batch/ScrapBatchScheduler.java
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,36 @@ | ||
package com.rollthedice.backend.batch; | ||
|
||
import lombok.RequiredArgsConstructor; | ||
import lombok.extern.slf4j.Slf4j; | ||
import org.springframework.batch.core.*; | ||
import org.springframework.batch.core.launch.JobLauncher; | ||
import org.springframework.batch.core.repository.JobExecutionAlreadyRunningException; | ||
import org.springframework.batch.core.repository.JobInstanceAlreadyCompleteException; | ||
import org.springframework.batch.core.repository.JobRepository; | ||
import org.springframework.scheduling.annotation.Scheduled; | ||
import org.springframework.stereotype.Component; | ||
|
||
@Slf4j | ||
@Component | ||
@RequiredArgsConstructor | ||
public class ScrapBatchScheduler { | ||
private final JobLauncher jobLauncher; | ||
private final ScrapJobConfig scrapJobConfig; | ||
private final JobRepository jobRepository; | ||
private final Step crawlingNewsUrlStep; | ||
private final Step crawlingNewsContentStep; | ||
|
||
// @Scheduled(cron = "0 0 6,12 * * *", zone = "Asia/Seoul") // DB 요금 문제로 개발 기간동안 주석 처리 | ||
public void runJob() { | ||
JobParameters jobParameters = new JobParametersBuilder() | ||
.addLong("time", System.currentTimeMillis()) | ||
.toJobParameters(); | ||
try { | ||
jobLauncher.run(scrapJobConfig.scrapJob(jobRepository, | ||
crawlingNewsUrlStep, crawlingNewsContentStep), jobParameters); | ||
} catch (JobExecutionAlreadyRunningException | JobInstanceAlreadyCompleteException | ||
| JobParametersInvalidException | org.springframework.batch.core.repository.JobRestartException e) { | ||
log.error(e.getMessage()); | ||
} | ||
} | ||
} |
118 changes: 118 additions & 0 deletions
118
backend/core/src/main/java/com/rollthedice/backend/batch/ScrapJobConfig.java
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,118 @@ | ||
package com.rollthedice.backend.batch; | ||
|
||
import com.rollthedice.backend.batch.newsContentStep.PreSummarizedNewsDto; | ||
import com.rollthedice.backend.batch.newsContentStep.UncrawledNewsContentReader; | ||
import com.rollthedice.backend.batch.newsUrlStep.InitNewsDto; | ||
import com.rollthedice.backend.batch.newsUrlStep.NewsUrlReader; | ||
import com.rollthedice.backend.domain.news.contentqueue.ContentProducer; | ||
import com.rollthedice.backend.domain.news.dto.ContentMessageDto; | ||
import com.rollthedice.backend.domain.news.repository.NewsRepository; | ||
import lombok.RequiredArgsConstructor; | ||
import lombok.extern.slf4j.Slf4j; | ||
import org.springframework.batch.core.Job; | ||
import org.springframework.batch.core.Step; | ||
import org.springframework.batch.core.configuration.annotation.JobScope; | ||
import org.springframework.batch.core.configuration.annotation.StepScope; | ||
import org.springframework.batch.core.job.builder.JobBuilder; | ||
import org.springframework.batch.core.launch.support.RunIdIncrementer; | ||
import org.springframework.batch.core.repository.JobRepository; | ||
import org.springframework.batch.core.step.builder.StepBuilder; | ||
import org.springframework.batch.item.ItemProcessor; | ||
import org.springframework.batch.item.ItemReader; | ||
import org.springframework.batch.item.database.JdbcBatchItemWriter; | ||
import org.springframework.batch.item.database.builder.JdbcBatchItemWriterBuilder; | ||
import org.springframework.beans.factory.annotation.Value; | ||
import org.springframework.context.annotation.Bean; | ||
import org.springframework.context.annotation.Configuration; | ||
import org.springframework.transaction.PlatformTransactionManager; | ||
|
||
import javax.sql.DataSource; | ||
|
||
@Configuration | ||
@RequiredArgsConstructor | ||
public class ScrapJobConfig { | ||
|
||
@Value("${batch.chunk-size}") | ||
private int chunkSize; | ||
|
||
private final DataSource dataSource; | ||
private final NewsRepository newsRepository; | ||
private final ContentProducer contentProducer; | ||
|
||
@Bean | ||
public Job scrapJob(JobRepository jobRepository, | ||
Step crawlingNewsUrlStep, Step crawlingNewsContentStep) { | ||
return new JobBuilder("scrapJob", jobRepository) | ||
.incrementer(new RunIdIncrementer()) | ||
.start(crawlingNewsUrlStep) | ||
.next(crawlingNewsContentStep) | ||
.build(); | ||
} | ||
|
||
@Bean | ||
@JobScope | ||
public Step crawlingNewsUrlStep(JobRepository jobRepository, | ||
PlatformTransactionManager transactionManager) { | ||
return new StepBuilder("crawlingNewsUrlStep", jobRepository) | ||
.allowStartIfComplete(true) | ||
.<InitNewsDto, InitNewsDto>chunk(30, transactionManager) | ||
.reader(newsUrlReader()) | ||
.writer(newsUrlWriter()) | ||
.build(); | ||
} | ||
|
||
@Bean | ||
@StepScope | ||
public ItemReader<InitNewsDto> newsUrlReader() { | ||
return new NewsUrlReader(); | ||
} | ||
|
||
@Bean | ||
@StepScope | ||
public JdbcBatchItemWriter<InitNewsDto> newsUrlWriter() { | ||
return new JdbcBatchItemWriterBuilder<InitNewsDto>() | ||
.dataSource(dataSource) | ||
.sql("insert into news(url, category) values (:url, :newsCategory)") | ||
.beanMapped() | ||
.build(); | ||
} | ||
|
||
@Bean | ||
@JobScope | ||
public Step crawlingNewsContentStep(JobRepository jobRepository, | ||
PlatformTransactionManager transactionManager) { | ||
return new StepBuilder("crawlingNewsContentStep", jobRepository) | ||
.allowStartIfComplete(true) | ||
.<PreSummarizedNewsDto, PreSummarizedNewsDto>chunk(chunkSize, transactionManager) | ||
.reader(uncrawledNewsContentReader()) | ||
.processor(summarizeContentProcessor()) | ||
.writer(newsContentWriter()) | ||
.build(); | ||
} | ||
|
||
@Bean | ||
@StepScope | ||
public ItemReader<PreSummarizedNewsDto> uncrawledNewsContentReader() { | ||
return new UncrawledNewsContentReader(newsRepository); | ||
} | ||
|
||
@Bean | ||
@StepScope | ||
public ItemProcessor<PreSummarizedNewsDto, PreSummarizedNewsDto> summarizeContentProcessor() { | ||
return dto -> { | ||
contentProducer.sendMessage(new ContentMessageDto(dto.getId(), dto.getContent())); | ||
return dto; | ||
}; | ||
} | ||
|
||
@Bean | ||
@StepScope | ||
public JdbcBatchItemWriter<PreSummarizedNewsDto> newsContentWriter() { | ||
return new JdbcBatchItemWriterBuilder<PreSummarizedNewsDto>() | ||
.dataSource(dataSource) | ||
.sql("update news set title = :title, content = :content, post_date = :postDate" + | ||
" where id = :id") | ||
.beanMapped() | ||
.build(); | ||
} | ||
} |
14 changes: 14 additions & 0 deletions
14
...ore/src/main/java/com/rollthedice/backend/batch/newsContentStep/PreSummarizedNewsDto.java
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,14 @@ | ||
package com.rollthedice.backend.batch.newsContentStep; | ||
|
||
import lombok.*; | ||
|
||
@Getter | ||
@NoArgsConstructor | ||
@AllArgsConstructor | ||
@Builder | ||
public class PreSummarizedNewsDto { | ||
private Long id; | ||
private String title; | ||
private String content; | ||
private String postDate; | ||
} |
80 changes: 80 additions & 0 deletions
80
...c/main/java/com/rollthedice/backend/batch/newsContentStep/UncrawledNewsContentReader.java
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,80 @@ | ||
package com.rollthedice.backend.batch.newsContentStep; | ||
|
||
import com.rollthedice.backend.domain.news.entity.News; | ||
import com.rollthedice.backend.domain.news.repository.NewsRepository; | ||
import org.jsoup.Jsoup; | ||
import org.jsoup.nodes.Document; | ||
import org.jsoup.nodes.Element; | ||
import org.jsoup.select.Elements; | ||
import org.springframework.batch.item.ItemReader; | ||
|
||
import java.io.IOException; | ||
import java.util.Iterator; | ||
|
||
public class UncrawledNewsContentReader implements ItemReader<PreSummarizedNewsDto> { | ||
private final NewsRepository newsRepository; | ||
private Iterator<News> uncrawledNewsContents; | ||
|
||
public UncrawledNewsContentReader(NewsRepository newsRepository) { | ||
this.newsRepository = newsRepository; | ||
uncrawledNewsContents = newsRepository.findAllByContentIsNull().iterator(); | ||
} | ||
|
||
@Override | ||
public PreSummarizedNewsDto read() throws IOException { | ||
if (!hasNextUncrawledNews()) { | ||
return null; | ||
} | ||
News news = uncrawledNewsContents.next(); | ||
Document doc = Jsoup.connect(news.getUrl()).get(); | ||
return getNewsContent(news, doc); | ||
} | ||
|
||
private boolean hasNextUncrawledNews() { | ||
if (!uncrawledNewsContents.hasNext()) { | ||
uncrawledNewsContents = newsRepository.findAllByContentIsNull().iterator(); | ||
} | ||
return uncrawledNewsContents.hasNext(); | ||
} | ||
|
||
private PreSummarizedNewsDto getNewsContent(News news, Document doc) { | ||
return PreSummarizedNewsDto.builder() | ||
.id(news.getId()) | ||
.title(scrapTitle(doc)) | ||
.content(scrapContent(doc)) | ||
.postDate(scrapPostDate(doc)) | ||
.build(); | ||
} | ||
|
||
private String scrapTitle(final Document doc) { | ||
Element titleElement = doc.selectFirst("#ct > div.media_end_head.go_trans > div.media_end_head_title > h2"); | ||
if (titleElement == null) { | ||
titleElement = doc.selectFirst("#content > div.end_ct > div > h2"); | ||
} | ||
if (titleElement != null) { | ||
return titleElement.text(); | ||
} | ||
return null; | ||
} | ||
|
||
private String scrapContent(final Document doc) { | ||
Elements contentElements = doc.select("article#dic_area"); | ||
if (contentElements.isEmpty()) { | ||
contentElements = doc.select("#articeBody"); | ||
} | ||
return contentElements.outerHtml().replaceAll("\\<[^>]*>|\\n", ""); | ||
} | ||
|
||
private String scrapPostDate(final Document doc) { | ||
Element dateElement = doc.selectFirst("div#ct> div.media_end_head.go_trans > div.media_end_head_info.nv_notrans > div.media_end_head_info_datestamp > div > span"); | ||
if (dateElement != null) { | ||
return dateElement.attr("data-date-time"); | ||
} else { | ||
Element altDateElement = doc.selectFirst("#content > div.end_ct > div > div.article_info > span > em"); | ||
if (altDateElement != null) { | ||
return altDateElement.text(); | ||
} | ||
} | ||
return null; | ||
} | ||
} |
15 changes: 15 additions & 0 deletions
15
backend/core/src/main/java/com/rollthedice/backend/batch/newsUrlStep/InitNewsDto.java
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,15 @@ | ||
package com.rollthedice.backend.batch.newsUrlStep; | ||
|
||
import lombok.AllArgsConstructor; | ||
import lombok.Getter; | ||
import lombok.NoArgsConstructor; | ||
import lombok.Setter; | ||
|
||
@Getter | ||
@NoArgsConstructor | ||
@AllArgsConstructor | ||
@Setter | ||
public class InitNewsDto { | ||
private String newsCategory; | ||
private String url; | ||
} |
53 changes: 53 additions & 0 deletions
53
backend/core/src/main/java/com/rollthedice/backend/batch/newsUrlStep/NewsUrlReader.java
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,53 @@ | ||
package com.rollthedice.backend.batch.newsUrlStep; | ||
|
||
import com.rollthedice.backend.domain.news.entity.NewsCategory; | ||
import org.jsoup.Jsoup; | ||
import org.jsoup.nodes.Document; | ||
import org.jsoup.nodes.Element; | ||
import org.jsoup.select.Elements; | ||
import org.springframework.batch.item.ItemReader; | ||
import org.springframework.beans.factory.annotation.Value; | ||
|
||
import java.io.IOException; | ||
import java.util.*; | ||
import java.util.stream.Collectors; | ||
|
||
public class NewsUrlReader implements ItemReader<InitNewsDto> { | ||
@Value("${crawling.quantity}") | ||
private int crawlingQuantity; | ||
|
||
private final Iterator<NewsCategory> categories; | ||
private final Queue<InitNewsDto> initNews = new LinkedList<>(); | ||
|
||
public NewsUrlReader() { | ||
categories = Arrays.stream(NewsCategory.values()).collect(Collectors.toList()).iterator(); | ||
} | ||
|
||
@Override | ||
public InitNewsDto read() throws IOException { | ||
while (initNews.isEmpty() && categories.hasNext()) { | ||
NewsCategory category = categories.next(); | ||
initNews.addAll(scrapCategoryNews(category)); | ||
} | ||
return initNews.poll(); | ||
} | ||
|
||
private List<InitNewsDto> scrapCategoryNews(NewsCategory category) throws IOException { | ||
Document doc = Jsoup.connect(category.getCategoryUrl()).get(); | ||
Elements newsList = doc.select(".sa_list").select("li"); | ||
if (newsList.size() < crawlingQuantity) { | ||
return scrapNewsUrl(newsList.size(), newsList, category); | ||
} | ||
return scrapNewsUrl(crawlingQuantity, newsList, category); | ||
} | ||
|
||
private List<InitNewsDto> scrapNewsUrl(int quantity, Elements newsList, NewsCategory category) { | ||
List<InitNewsDto> urls = new ArrayList<>(); | ||
for (int i = 0; i < quantity; i++) { | ||
Element news = newsList.get(i); | ||
String url = Objects.requireNonNull(news.selectFirst(".sa_text_title")).attr("href"); | ||
urls.add(new InitNewsDto(category.getName(), url)); | ||
} | ||
return urls; | ||
} | ||
} |
Oops, something went wrong.